]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/regextst.cpp
ICU-57131.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
1 /********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2016, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7 //
8 // regextst.cpp
9 //
10 // ICU Regular Expressions test, part of intltest.
11 //
12
13 /*
14 NOTE!!
15
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
20
21 */
22
23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29
30 #include "unicode/localpointer.h"
31 #include "unicode/regex.h"
32 #include "unicode/uchar.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uregex.h"
36 #include "unicode/usetiter.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39
40 #include "regextst.h"
41 #include "regexcmp.h"
42 #include "uvector.h"
43 #include "util.h"
44 #include "cmemory.h"
45 #include "cstring.h"
46 #include "uinvchar.h"
47
48 #define SUPPORT_MUTATING_INPUT_STRING 0
49
50 //---------------------------------------------------------------------------
51 //
52 // Test class boilerplate
53 //
54 //---------------------------------------------------------------------------
55 RegexTest::RegexTest()
56 {
57 }
58
59
60 RegexTest::~RegexTest()
61 {
62 }
63
64
65
66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
67 {
68 if (exec) logln("TestSuite RegexTest: ");
69 switch (index) {
70
71 case 0: name = "Basic";
72 if (exec) Basic();
73 break;
74 case 1: name = "API_Match";
75 if (exec) API_Match();
76 break;
77 case 2: name = "API_Replace";
78 if (exec) API_Replace();
79 break;
80 case 3: name = "API_Pattern";
81 if (exec) API_Pattern();
82 break;
83 case 4:
84 #if !UCONFIG_NO_FILE_IO
85 name = "Extended";
86 if (exec) Extended();
87 #else
88 name = "skip";
89 #endif
90 break;
91 case 5: name = "Errors";
92 if (exec) Errors();
93 break;
94 case 6: name = "PerlTests";
95 if (exec) PerlTests();
96 break;
97 case 7: name = "Callbacks";
98 if (exec) Callbacks();
99 break;
100 case 8: name = "FindProgressCallbacks";
101 if (exec) FindProgressCallbacks();
102 break;
103 case 9: name = "Bug 6149";
104 if (exec) Bug6149();
105 break;
106 case 10: name = "UTextBasic";
107 if (exec) UTextBasic();
108 break;
109 case 11: name = "API_Match_UTF8";
110 if (exec) API_Match_UTF8();
111 break;
112 case 12: name = "API_Replace_UTF8";
113 if (exec) API_Replace_UTF8();
114 break;
115 case 13: name = "API_Pattern_UTF8";
116 if (exec) API_Pattern_UTF8();
117 break;
118 case 14: name = "PerlTestsUTF8";
119 if (exec) PerlTestsUTF8();
120 break;
121 case 15: name = "PreAllocatedUTextCAPI";
122 if (exec) PreAllocatedUTextCAPI();
123 break;
124 case 16: name = "Bug 7651";
125 if (exec) Bug7651();
126 break;
127 case 17: name = "Bug 7740";
128 if (exec) Bug7740();
129 break;
130 case 18: name = "Bug 8479";
131 if (exec) Bug8479();
132 break;
133 case 19: name = "Bug 7029";
134 if (exec) Bug7029();
135 break;
136 case 20: name = "CheckInvBufSize";
137 if (exec) CheckInvBufSize();
138 break;
139 case 21: name = "Bug 9283";
140 if (exec) Bug9283();
141 break;
142 case 22: name = "Bug10459";
143 if (exec) Bug10459();
144 break;
145 case 23: name = "TestCaseInsensitiveStarters";
146 if (exec) TestCaseInsensitiveStarters();
147 break;
148 case 24: name = "TestBug11049";
149 if (exec) TestBug11049();
150 break;
151 case 25: name = "TestBug11371";
152 if (exec) TestBug11371();
153 break;
154 case 26: name = "TestBug11480";
155 if (exec) TestBug11480();
156 break;
157 case 27: name = "NamedCapture";
158 if (exec) NamedCapture();
159 break;
160 case 28: name = "NamedCaptureLimits";
161 if (exec) NamedCaptureLimits();
162 break;
163 default: name = "";
164 break; //needed to end loop
165 }
166 }
167
168
169
170 /**
171 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
172 * into ASCII.
173 * @see utext_openUTF8
174 */
175 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
176
177 //---------------------------------------------------------------------------
178 //
179 // Error Checking / Reporting macros used in all of the tests.
180 //
181 //---------------------------------------------------------------------------
182
183 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
184 int64_t oldIndex = utext_getNativeIndex(text);
185 utext_setNativeIndex(text, 0);
186 char *bufPtr = buf;
187 UChar32 c = utext_next32From(text, 0);
188 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
189 if (0x000020<=c && c<0x00007e) {
190 *bufPtr = c;
191 } else {
192 #if 0
193 sprintf(bufPtr,"U+%04X", c);
194 bufPtr+= strlen(bufPtr)-1;
195 #else
196 *bufPtr = '%';
197 #endif
198 }
199 bufPtr++;
200 c = UTEXT_NEXT32(text);
201 }
202 *bufPtr = 0;
203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
204 char *ebuf = (char*)malloc(bufLen);
205 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
206 uprv_strncpy(buf, ebuf, bufLen);
207 free((void*)ebuf);
208 #endif
209 utext_setNativeIndex(text, oldIndex);
210 }
211
212
213 static char ASSERT_BUF[1024];
214
215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
216 if(message.length()==0) {
217 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
218 } else {
219 UnicodeString buf;
220 IntlTest::prettify(message,buf);
221 if(buf.length()==0) {
222 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
223 } else {
224 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
225 if(ASSERT_BUF[0]==0) {
226 ASSERT_BUF[0]=0;
227 for(int32_t i=0;i<buf.length();i++) {
228 UChar ch = buf[i];
229 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
230 }
231 }
232 }
233 }
234 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
235 return ASSERT_BUF;
236 }
237
// Logs a printable rendering of a UText (at most 199 characters, produced by
// utextToPrintable) together with the invoking file/line and the argument's
// source spelling.
238 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
239
// Expects a UErrorCode variable named `status` in the invoking scope. On
// failure it reports through dataerrln() and RETURNS from the enclosing
// function.
240 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
241 __FILE__, __LINE__, u_errorName(status)); return;}}
242
// Reports a test error when expr evaluates to FALSE. Does not return, so the
// calling test keeps running.
243 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
244
// Evaluates expr against a fresh local `status` that shadows any outer
// variable of that name for the duration of the macro, then checks that it
// ended up equal to errcode. expr is therefore expected to mention `status`.
245 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
246 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
247 __LINE__, u_errorName(errcode), u_errorName(status));};}
248
// Like REGEX_CHECK_STATUS, but also prints the caller-supplied originating
// line and does NOT return.
249 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
250 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
251
// Like REGEX_ASSERT, but prints the caller-supplied originating line and
// RETURNS from the enclosing function on failure.
252 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
253 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
254
// Compares an invariant-character C string with a UnicodeString and reports a
// test error when they differ; `actual` is rendered with extractToAssertBuf().
255 // expected: const char * , restricted to invariant characters.
256 // actual: const UnicodeString &
257 #define REGEX_ASSERT_UNISTR(expected, actual) { \
258 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
259 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
260 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
261
262
263 static UBool testUTextEqual(UText *uta, UText *utb) {
264 UChar32 ca = 0;
265 UChar32 cb = 0;
266 utext_setNativeIndex(uta, 0);
267 utext_setNativeIndex(utb, 0);
268 do {
269 ca = utext_next32(uta);
270 cb = utext_next32(utb);
271 if (ca != cb) {
272 break;
273 }
274 } while (ca != U_SENTINEL);
275 return ca == cb;
276 }
277
278
279 /**
280 * @param expected expected text in UTF-8 (not platform) codepage
281 */
282 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
283 UErrorCode status = U_ZERO_ERROR;
284 UText expectedText = UTEXT_INITIALIZER;
285 utext_openUTF8(&expectedText, expected, -1, &status);
286 if(U_FAILURE(status)) {
287 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
288 return;
289 }
290 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
291 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
292 return;
293 }
294 utext_setNativeIndex(actual, 0);
295 if (!testUTextEqual(&expectedText, actual)) {
296 char buf[201 /*21*/];
297 char expectedBuf[201];
298 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
299 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
300 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
301 }
302 utext_close(&expectedText);
303 }
304 /**
305 * @param expected invariant (platform local text) input
306 */
307
308 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
309 UErrorCode status = U_ZERO_ERROR;
310 UText expectedText = UTEXT_INITIALIZER;
311 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
312 if(U_FAILURE(status)) {
313 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
314 return;
315 }
316 utext_setNativeIndex(actual, 0);
317 if (!testUTextEqual(&expectedText, actual)) {
318 char buf[201 /*21*/];
319 char expectedBuf[201];
320 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
321 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
322 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
323 }
324 utext_close(&expectedText);
325 }
326
327 /**
328 * Assumes utf-8 input: `expected` is handed to assertUText() together with
 * the invoking file and line, which appear in any failure message.
329 */
330 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
331 /**
332 * Assumes Invariant input: `expected` is handed to assertUTextInvariant()
 * together with the invoking file and line.
333 */
334 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
335
336 /**
337 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
338 * passed into utext_openUTF8. An error will be given if
339 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
340 */
341
// Capacity, in bytes, of inv_buf.
342 #define INV_BUFSIZ 2048 /* increase this if too small */
343
// Running total of the lengths passed through regextst_openUTF8FromInvariant().
// On non-ASCII platforms it is also the offset of the next free byte in inv_buf.
344 static int64_t inv_next=0;
345
// Storage for the converted strings; only compiled on non-ASCII platforms.
346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
347 static char inv_buf[INV_BUFSIZ];
348 #endif
349
350 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
351 if(length==-1) length=strlen(inv);
352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
353 inv_next+=length;
354 return utext_openUTF8(ut, inv, length, status);
355 #else
356 if(inv_next+length+1>INV_BUFSIZ) {
357 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
358 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
359 *status = U_MEMORY_ALLOCATION_ERROR;
360 return NULL;
361 }
362
363 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
364 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
365 inv_next+=length;
366
367 #if 0
368 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
369 #endif
370
371 return utext_openUTF8(ut, (const char*)buf, length, status);
372 #endif
373 }
374
375
376 //---------------------------------------------------------------------------
377 //
378 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
379 // for the LookingAt() and Match() functions.
380 //
381 // usage:
382 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
383 //
384 // The expected results are UBool - TRUE or FALSE.
385 // The input text is unescaped. The pattern is not.
386 //
387 //
388 //---------------------------------------------------------------------------
389
// Runs the same expectations twice: through doRegexLMTest() and through
// doRegexLMTestUTF8(). Both receive the invoking line number for error reports.
390 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
391
392 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
393 const UnicodeString pattern(pat, -1, US_INV);
394 const UnicodeString inputText(text, -1, US_INV);
395 UErrorCode status = U_ZERO_ERROR;
396 UParseError pe;
397 RegexPattern *REPattern = NULL;
398 RegexMatcher *REMatcher = NULL;
399 UBool retVal = TRUE;
400
401 UnicodeString patString(pat, -1, US_INV);
402 REPattern = RegexPattern::compile(patString, 0, pe, status);
403 if (U_FAILURE(status)) {
404 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
405 line, u_errorName(status));
406 return FALSE;
407 }
408 if (line==376) { REPattern->dumpPattern();}
409
410 UnicodeString inputString(inputText);
411 UnicodeString unEscapedInput = inputString.unescape();
412 REMatcher = REPattern->matcher(unEscapedInput, status);
413 if (U_FAILURE(status)) {
414 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
415 line, u_errorName(status));
416 return FALSE;
417 }
418
419 UBool actualmatch;
420 actualmatch = REMatcher->lookingAt(status);
421 if (U_FAILURE(status)) {
422 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
423 line, u_errorName(status));
424 retVal = FALSE;
425 }
426 if (actualmatch != looking) {
427 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
428 retVal = FALSE;
429 }
430
431 status = U_ZERO_ERROR;
432 actualmatch = REMatcher->matches(status);
433 if (U_FAILURE(status)) {
434 errln("RegexTest failure in matches() at line %d. Status = %s\n",
435 line, u_errorName(status));
436 retVal = FALSE;
437 }
438 if (actualmatch != match) {
439 errln("RegexTest: wrong return from matches() at line %d.\n", line);
440 retVal = FALSE;
441 }
442
443 if (retVal == FALSE) {
444 REPattern->dumpPattern();
445 }
446
447 delete REPattern;
448 delete REMatcher;
449 return retVal;
450 }
451
452
453 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
454 UText pattern = UTEXT_INITIALIZER;
455 int32_t inputUTF8Length;
456 char *textChars = NULL;
457 UText inputText = UTEXT_INITIALIZER;
458 UErrorCode status = U_ZERO_ERROR;
459 UParseError pe;
460 RegexPattern *REPattern = NULL;
461 RegexMatcher *REMatcher = NULL;
462 UBool retVal = TRUE;
463
464 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
465 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
466 if (U_FAILURE(status)) {
467 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
468 line, u_errorName(status));
469 return FALSE;
470 }
471
472 UnicodeString inputString(text, -1, US_INV);
473 UnicodeString unEscapedInput = inputString.unescape();
474 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
475 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
476
477 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
478 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
479 // UTF-8 does not allow unpaired surrogates, so this could actually happen
480 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
481 return TRUE; // not a failure of the Regex engine
482 }
483 status = U_ZERO_ERROR; // buffer overflow
484 textChars = new char[inputUTF8Length+1];
485 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
486 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
487
488 REMatcher = &REPattern->matcher(status)->reset(&inputText);
489 if (U_FAILURE(status)) {
490 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
491 line, u_errorName(status));
492 return FALSE;
493 }
494
495 UBool actualmatch;
496 actualmatch = REMatcher->lookingAt(status);
497 if (U_FAILURE(status)) {
498 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
499 line, u_errorName(status));
500 retVal = FALSE;
501 }
502 if (actualmatch != looking) {
503 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
504 retVal = FALSE;
505 }
506
507 status = U_ZERO_ERROR;
508 actualmatch = REMatcher->matches(status);
509 if (U_FAILURE(status)) {
510 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
511 line, u_errorName(status));
512 retVal = FALSE;
513 }
514 if (actualmatch != match) {
515 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
516 retVal = FALSE;
517 }
518
519 if (retVal == FALSE) {
520 REPattern->dumpPattern();
521 }
522
523 delete REPattern;
524 delete REMatcher;
525 utext_close(&inputText);
526 utext_close(&pattern);
527 delete[] textChars;
528 return retVal;
529 }
530
531
532
533 //---------------------------------------------------------------------------
534 //
535 // REGEX_ERR Macro + invocation function to simplify writing
536 // regex tests for incorrect patterns
537 //
538 // usage:
539 // REGEX_ERR("pattern", expected error line, column, expected status);
540 //
541 //---------------------------------------------------------------------------
// Forwards to regex_err(), appending the invoking source line. The expansion
// already ends in ';'.
542 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
543
544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
545 UErrorCode expectedStatus, int32_t line) {
546 UnicodeString pattern(pat);
547
548 UErrorCode status = U_ZERO_ERROR;
549 UParseError pe;
550 RegexPattern *callerPattern = NULL;
551
552 //
553 // Compile the caller's pattern
554 //
555 UnicodeString patString(pat);
556 callerPattern = RegexPattern::compile(patString, 0, pe, status);
557 if (status != expectedStatus) {
558 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
559 } else {
560 if (status != U_ZERO_ERROR) {
561 if (pe.line != errLine || pe.offset != errCol) {
562 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
563 line, errLine, errCol, pe.line, pe.offset);
564 }
565 }
566 }
567
568 delete callerPattern;
569
570 //
571 // Compile again, using a UTF-8-based UText
572 //
573 UText patternText = UTEXT_INITIALIZER;
574 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
575 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
576 if (status != expectedStatus) {
577 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
578 } else {
579 if (status != U_ZERO_ERROR) {
580 if (pe.line != errLine || pe.offset != errCol) {
581 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
582 line, errLine, errCol, pe.line, pe.offset);
583 }
584 }
585 }
586
587 delete callerPattern;
588 utext_close(&patternText);
589 }
590
591
592
593 //---------------------------------------------------------------------------
594 //
595 // Basic Check for basic functionality of regex pattern matching.
596 // Avoid the use of REGEX_FIND test macro, which has
597 // substantial dependencies on basic Regex functionality.
598 //
599 //---------------------------------------------------------------------------
// Basic smoke test of pattern matching, written as a flat list of
// REGEX_TESTLM(pattern, input, expectLookingAt, expectMatches) cases.
// Each case runs through both the UnicodeString and the UTF-8 UText helpers.
// The input text is unescaped before matching; the pattern is used as written.
// Failures are reported by the source line of the case.
600 void RegexTest::Basic() {
601
602
603 //
604 // Debug - slide failing test cases early
605 //
606 #if 0
607 {
608 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
609 UParseError pe;
610 UErrorCode status = U_ZERO_ERROR;
611 RegexPattern *pattern;
612 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
613 pattern->dumpPattern();
614 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
615 UBool result = m->find();
616 printf("result = %d\n", result);
617 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
618 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
619 }
620 exit(1);
621 #endif
622
623
624 //
625 // Pattern with parentheses
626 //
627 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
628 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
629 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
630
631 //
632 // Patterns with *
633 //
634 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
635 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
636 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
637 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
638 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
639
640 REGEX_TESTLM("a*", "", TRUE, TRUE);
641 REGEX_TESTLM("a*", "b", TRUE, FALSE);
642
643
644 //
645 // Patterns with "."
646 //
647 REGEX_TESTLM(".", "abc", TRUE, FALSE);
648 REGEX_TESTLM("...", "abc", TRUE, TRUE);
649 REGEX_TESTLM("....", "abc", FALSE, FALSE);
650 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
651 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
652 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
653 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
654 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
655
656 //
657 // Patterns with * applied to chars at end of literal string
658 //
659 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
660 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
661
662 //
663 // Supplemental chars match as single chars, not a pair of surrogates.
664 //
665 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
666 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
667 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
668
669
670 //
671 // UnicodeSets in the pattern
672 //
673 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
674 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
675 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
676 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
// NOTE(review): the next case repeats the previous one verbatim.
677 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
678 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
679
680 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
681 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
682 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
683 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
684 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
685
686 //
687 // OR operator in patterns
688 //
689 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
690 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
691 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
692 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
693
694 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
695 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
696 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
697 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
698 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
699 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
700
701 //
702 // +
703 //
704 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
705 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
706 REGEX_TESTLM("b+", "", FALSE, FALSE);
707 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
708 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
709 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
710
711 //
712 // ?
713 //
714 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
715 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
716 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
717 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
718 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
719 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
720 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
721 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
722 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
723
724 //
725 // Escape sequences that become single literal chars, handled internally
726 // by ICU's Unescape.
727 //
728
729 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
730 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
731 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
732 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
733 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
734 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
735 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
736 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
737 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
738 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
739
740 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
741 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
742
743 // Escape of special chars in patterns
744 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
745 }
746
747
748 //---------------------------------------------------------------------------
749 //
750 // UTextBasic Check for quirks that are specific to the UText
751 // implementation.
752 //
753 //---------------------------------------------------------------------------
754 void RegexTest::UTextBasic() {
755 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
756 UErrorCode status = U_ZERO_ERROR;
757 UText pattern = UTEXT_INITIALIZER;
758 utext_openUTF8(&pattern, str_abc, -1, &status);
759 RegexMatcher matcher(&pattern, 0, status);
760 REGEX_CHECK_STATUS;
761
762 UText input = UTEXT_INITIALIZER;
763 utext_openUTF8(&input, str_abc, -1, &status);
764 REGEX_CHECK_STATUS;
765 matcher.reset(&input);
766 REGEX_CHECK_STATUS;
767 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
768
769 matcher.reset(matcher.inputText());
770 REGEX_CHECK_STATUS;
771 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
772
773 utext_close(&pattern);
774 utext_close(&input);
775 }
776
777
778 //---------------------------------------------------------------------------
779 //
780 // API_Match Test that the API for class RegexMatcher
781 // is present and nominally working, but excluding functions
782 // implementing replace operations.
783 //
784 //---------------------------------------------------------------------------
785 void RegexTest::API_Match() {
786 UParseError pe;
787 UErrorCode status=U_ZERO_ERROR;
788 int32_t flags = 0;
789
790 //
791 // Debug - slide failing test cases early
792 //
793 #if 0
794 {
795 }
796 return;
797 #endif
798
799 //
800 // Simple pattern compilation
801 //
802 {
803 UnicodeString re("abc");
804 RegexPattern *pat2;
805 pat2 = RegexPattern::compile(re, flags, pe, status);
806 REGEX_CHECK_STATUS;
807
808 UnicodeString inStr1 = "abcdef this is a test";
809 UnicodeString instr2 = "not abc";
810 UnicodeString empty = "";
811
812
813 //
814 // Matcher creation and reset.
815 //
816 RegexMatcher *m1 = pat2->matcher(inStr1, status);
817 REGEX_CHECK_STATUS;
818 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
819 REGEX_ASSERT(m1->input() == inStr1);
820 m1->reset(instr2);
821 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
822 REGEX_ASSERT(m1->input() == instr2);
823 m1->reset(inStr1);
824 REGEX_ASSERT(m1->input() == inStr1);
825 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
826 m1->reset(empty);
827 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
828 REGEX_ASSERT(m1->input() == empty);
829 REGEX_ASSERT(&m1->pattern() == pat2);
830
831 //
832 // reset(pos, status)
833 //
834 m1->reset(inStr1);
835 m1->reset(4, status);
836 REGEX_CHECK_STATUS;
837 REGEX_ASSERT(m1->input() == inStr1);
838 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
839
840 m1->reset(-1, status);
841 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
842 status = U_ZERO_ERROR;
843
844 m1->reset(0, status);
845 REGEX_CHECK_STATUS;
846 status = U_ZERO_ERROR;
847
848 int32_t len = m1->input().length();
849 m1->reset(len-1, status);
850 REGEX_CHECK_STATUS;
851 status = U_ZERO_ERROR;
852
853 m1->reset(len, status);
854 REGEX_CHECK_STATUS;
855 status = U_ZERO_ERROR;
856
857 m1->reset(len+1, status);
858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859 status = U_ZERO_ERROR;
860
861 //
862 // match(pos, status)
863 //
864 m1->reset(instr2);
865 REGEX_ASSERT(m1->matches(4, status) == TRUE);
866 m1->reset();
867 REGEX_ASSERT(m1->matches(3, status) == FALSE);
868 m1->reset();
869 REGEX_ASSERT(m1->matches(5, status) == FALSE);
870 REGEX_ASSERT(m1->matches(4, status) == TRUE);
871 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
872 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
873
874 // Match() at end of string should fail, but should not
875 // be an error.
876 status = U_ZERO_ERROR;
877 len = m1->input().length();
878 REGEX_ASSERT(m1->matches(len, status) == FALSE);
879 REGEX_CHECK_STATUS;
880
881 // Match beyond end of string should fail with an error.
882 status = U_ZERO_ERROR;
883 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885
886 // Successful match at end of string.
887 {
888 status = U_ZERO_ERROR;
889 RegexMatcher m("A?", 0, status); // will match zero length string.
890 REGEX_CHECK_STATUS;
891 m.reset(inStr1);
892 len = inStr1.length();
893 REGEX_ASSERT(m.matches(len, status) == TRUE);
894 REGEX_CHECK_STATUS;
895 m.reset(empty);
896 REGEX_ASSERT(m.matches(0, status) == TRUE);
897 REGEX_CHECK_STATUS;
898 }
899
900
901 //
902 // lookingAt(pos, status)
903 //
904 status = U_ZERO_ERROR;
905 m1->reset(instr2); // "not abc"
906 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
907 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
908 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
909 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
910 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
911 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
912 status = U_ZERO_ERROR;
913 len = m1->input().length();
914 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
915 REGEX_CHECK_STATUS;
916 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
917 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
918
919 delete m1;
920 delete pat2;
921 }
922
923
924 //
925 // Capture Group.
926 // RegexMatcher::start();
927 // RegexMatcher::end();
928 // RegexMatcher::groupCount();
929 //
930 {
931 int32_t flags=0;
932 UParseError pe;
933 UErrorCode status=U_ZERO_ERROR;
934
935 UnicodeString re("01(23(45)67)(.*)");
936 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
937 REGEX_CHECK_STATUS;
938 UnicodeString data = "0123456789";
939
940 RegexMatcher *matcher = pat->matcher(data, status);
941 REGEX_CHECK_STATUS;
942 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
943 static const int32_t matchStarts[] = {0, 2, 4, 8};
944 static const int32_t matchEnds[] = {10, 8, 6, 10};
945 int32_t i;
946 for (i=0; i<4; i++) {
947 int32_t actualStart = matcher->start(i, status);
948 REGEX_CHECK_STATUS;
949 if (actualStart != matchStarts[i]) {
950 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
951 __LINE__, i, matchStarts[i], actualStart);
952 }
953 int32_t actualEnd = matcher->end(i, status);
954 REGEX_CHECK_STATUS;
955 if (actualEnd != matchEnds[i]) {
956 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
957 __LINE__, i, matchEnds[i], actualEnd);
958 }
959 }
960
961 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
962 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
963
964 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
965 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
966 matcher->reset();
967 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
968
969 matcher->lookingAt(status);
970 REGEX_ASSERT(matcher->group(status) == "0123456789");
971 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
972 REGEX_ASSERT(matcher->group(1, status) == "234567" );
973 REGEX_ASSERT(matcher->group(2, status) == "45" );
974 REGEX_ASSERT(matcher->group(3, status) == "89" );
975 REGEX_CHECK_STATUS;
976 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
977 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
978 matcher->reset();
979 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
980
981 delete matcher;
982 delete pat;
983
984 }
985
986 //
987 // find
988 //
989 {
990 int32_t flags=0;
991 UParseError pe;
992 UErrorCode status=U_ZERO_ERROR;
993
994 UnicodeString re("abc");
995 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
996 REGEX_CHECK_STATUS;
997 UnicodeString data = ".abc..abc...abc..";
998 // 012345678901234567
999
1000 RegexMatcher *matcher = pat->matcher(data, status);
1001 REGEX_CHECK_STATUS;
1002 REGEX_ASSERT(matcher->find());
1003 REGEX_ASSERT(matcher->start(status) == 1);
1004 REGEX_ASSERT(matcher->find());
1005 REGEX_ASSERT(matcher->start(status) == 6);
1006 REGEX_ASSERT(matcher->find());
1007 REGEX_ASSERT(matcher->start(status) == 12);
1008 REGEX_ASSERT(matcher->find() == FALSE);
1009 REGEX_ASSERT(matcher->find() == FALSE);
1010
1011 matcher->reset();
1012 REGEX_ASSERT(matcher->find());
1013 REGEX_ASSERT(matcher->start(status) == 1);
1014
1015 REGEX_ASSERT(matcher->find(0, status));
1016 REGEX_ASSERT(matcher->start(status) == 1);
1017 REGEX_ASSERT(matcher->find(1, status));
1018 REGEX_ASSERT(matcher->start(status) == 1);
1019 REGEX_ASSERT(matcher->find(2, status));
1020 REGEX_ASSERT(matcher->start(status) == 6);
1021 REGEX_ASSERT(matcher->find(12, status));
1022 REGEX_ASSERT(matcher->start(status) == 12);
1023 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1024 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1025 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1026 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1027
1028 status = U_ZERO_ERROR;
1029 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1030 status = U_ZERO_ERROR;
1031 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032
1033 REGEX_ASSERT(matcher->groupCount() == 0);
1034
1035 delete matcher;
1036 delete pat;
1037 }
1038
1039
1040 //
1041 // find, with \G in pattern (true if at the end of a previous match).
1042 //
1043 {
1044 int32_t flags=0;
1045 UParseError pe;
1046 UErrorCode status=U_ZERO_ERROR;
1047
1048 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1049 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1050 REGEX_CHECK_STATUS;
1051 UnicodeString data = ".abcabc.abc..";
1052 // 012345678901234567
1053
1054 RegexMatcher *matcher = pat->matcher(data, status);
1055 REGEX_CHECK_STATUS;
1056 REGEX_ASSERT(matcher->find());
1057 REGEX_ASSERT(matcher->start(status) == 0);
1058 REGEX_ASSERT(matcher->start(1, status) == -1);
1059 REGEX_ASSERT(matcher->start(2, status) == 1);
1060
1061 REGEX_ASSERT(matcher->find());
1062 REGEX_ASSERT(matcher->start(status) == 4);
1063 REGEX_ASSERT(matcher->start(1, status) == 4);
1064 REGEX_ASSERT(matcher->start(2, status) == -1);
1065 REGEX_CHECK_STATUS;
1066
1067 delete matcher;
1068 delete pat;
1069 }
1070
1071 //
1072 // find with zero length matches, match position should bump ahead
1073 // to prevent loops.
1074 //
1075 {
1076 int32_t i;
1077 UErrorCode status=U_ZERO_ERROR;
1078 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1079 // using an always-true look-ahead.
1080 REGEX_CHECK_STATUS;
1081 UnicodeString s(" ");
1082 m.reset(s);
1083 for (i=0; ; i++) {
1084 if (m.find() == FALSE) {
1085 break;
1086 }
1087 REGEX_ASSERT(m.start(status) == i);
1088 REGEX_ASSERT(m.end(status) == i);
1089 }
1090 REGEX_ASSERT(i==5);
1091
1092 // Check that the bump goes over surrogate pairs OK
1093 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1094 s = s.unescape();
1095 m.reset(s);
1096 for (i=0; ; i+=2) {
1097 if (m.find() == FALSE) {
1098 break;
1099 }
1100 REGEX_ASSERT(m.start(status) == i);
1101 REGEX_ASSERT(m.end(status) == i);
1102 }
1103 REGEX_ASSERT(i==10);
1104 }
1105 {
1106 // find() loop breaking test.
1107 // with pattern of /.?/, should see a series of one char matches, then a single
1108 // match of zero length at the end of the input string.
1109 int32_t i;
1110 UErrorCode status=U_ZERO_ERROR;
1111 RegexMatcher m(".?", 0, status);
1112 REGEX_CHECK_STATUS;
1113 UnicodeString s(" ");
1114 m.reset(s);
1115 for (i=0; ; i++) {
1116 if (m.find() == FALSE) {
1117 break;
1118 }
1119 REGEX_ASSERT(m.start(status) == i);
1120 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1121 }
1122 REGEX_ASSERT(i==5);
1123 }
1124
1125
1126 //
1127 // Matchers with no input string behave as if they had an empty input string.
1128 //
1129
1130 {
1131 UErrorCode status = U_ZERO_ERROR;
1132 RegexMatcher m(".?", 0, status);
1133 REGEX_CHECK_STATUS;
1134 REGEX_ASSERT(m.find());
1135 REGEX_ASSERT(m.start(status) == 0);
1136 REGEX_ASSERT(m.input() == "");
1137 }
1138 {
1139 UErrorCode status = U_ZERO_ERROR;
1140 RegexPattern *p = RegexPattern::compile(".", 0, status);
1141 RegexMatcher *m = p->matcher(status);
1142 REGEX_CHECK_STATUS;
1143
1144 REGEX_ASSERT(m->find() == FALSE);
1145 REGEX_ASSERT(m->input() == "");
1146 delete m;
1147 delete p;
1148 }
1149
1150 //
1151 // Regions
1152 //
1153 {
1154 UErrorCode status = U_ZERO_ERROR;
1155 UnicodeString testString("This is test data");
1156 RegexMatcher m(".*", testString, 0, status);
1157 REGEX_CHECK_STATUS;
1158 REGEX_ASSERT(m.regionStart() == 0);
1159 REGEX_ASSERT(m.regionEnd() == testString.length());
1160 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163 m.region(2,4, status);
1164 REGEX_CHECK_STATUS;
1165 REGEX_ASSERT(m.matches(status));
1166 REGEX_ASSERT(m.start(status)==2);
1167 REGEX_ASSERT(m.end(status)==4);
1168 REGEX_CHECK_STATUS;
1169
1170 m.reset();
1171 REGEX_ASSERT(m.regionStart() == 0);
1172 REGEX_ASSERT(m.regionEnd() == testString.length());
1173
1174 UnicodeString shorterString("short");
1175 m.reset(shorterString);
1176 REGEX_ASSERT(m.regionStart() == 0);
1177 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1178
1179 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1180 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1181 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1182 REGEX_ASSERT(&m == &m.reset());
1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184
1185 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1186 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1187 REGEX_ASSERT(&m == &m.reset());
1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189
1190 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1191 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1192 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1193 REGEX_ASSERT(&m == &m.reset());
1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195
1196 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1197 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1198 REGEX_ASSERT(&m == &m.reset());
1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200
1201 }
1202
1203 //
1204 // hitEnd() and requireEnd()
1205 //
1206 {
1207 UErrorCode status = U_ZERO_ERROR;
1208 UnicodeString testString("aabb");
1209 RegexMatcher m1(".*", testString, 0, status);
1210 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1211 REGEX_ASSERT(m1.hitEnd() == TRUE);
1212 REGEX_ASSERT(m1.requireEnd() == FALSE);
1213 REGEX_CHECK_STATUS;
1214
1215 status = U_ZERO_ERROR;
1216 RegexMatcher m2("a*", testString, 0, status);
1217 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1218 REGEX_ASSERT(m2.hitEnd() == FALSE);
1219 REGEX_ASSERT(m2.requireEnd() == FALSE);
1220 REGEX_CHECK_STATUS;
1221
1222 status = U_ZERO_ERROR;
1223 RegexMatcher m3(".*$", testString, 0, status);
1224 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1225 REGEX_ASSERT(m3.hitEnd() == TRUE);
1226 REGEX_ASSERT(m3.requireEnd() == TRUE);
1227 REGEX_CHECK_STATUS;
1228 }
1229
1230
1231 //
1232 // Compilation error on reset with UChar *
1233 // These were a hazard that people were stumbling over with runtime errors.
1234 // Changed them to compiler errors by adding private methods that more closely
1235 // matched the incorrect use of the functions.
1236 //
1237 #if 0
1238 {
1239 UErrorCode status = U_ZERO_ERROR;
1240 UChar ucharString[20];
1241 RegexMatcher m(".", 0, status);
1242 m.reset(ucharString); // should not compile.
1243
1244 RegexPattern *p = RegexPattern::compile(".", 0, status);
1245 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1246
1247 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1248 }
1249 #endif
1250
1251 //
1252 // Time Outs.
1253 // Note: These tests will need to be changed when the regexp engine is
1254 // able to detect and cut short the exponential time behavior on
1255 // this type of match.
1256 //
1257 {
1258 UErrorCode status = U_ZERO_ERROR;
1259 // Enough 'a's in the string to cause the match to time out.
1260 // (Each additional 'a' doubles the time)
1261 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1262 RegexMatcher matcher("(a+)+b", testString, 0, status);
1263 REGEX_CHECK_STATUS;
1264 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1265 matcher.setTimeLimit(100, status);
1266 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1267 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1269 }
1270 {
1271 UErrorCode status = U_ZERO_ERROR;
1272 // Few enough 'a's to slip in under the time limit.
1273 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1274 RegexMatcher matcher("(a+)+b", testString, 0, status);
1275 REGEX_CHECK_STATUS;
1276 matcher.setTimeLimit(100, status);
1277 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1278 REGEX_CHECK_STATUS;
1279 }
1280
1281 //
1282 // Stack Limits
1283 //
1284 {
1285 UErrorCode status = U_ZERO_ERROR;
1286 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1287
1288 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1289 // of the '+', and makes the stack frames larger.
1290 RegexMatcher matcher("(A)+A$", testString, 0, status);
1291
1292 // With the default stack, this match should fail to run
1293 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1294 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1295
1296 // With unlimited stack, it should run
1297 status = U_ZERO_ERROR;
1298 matcher.setStackLimit(0, status);
1299 REGEX_CHECK_STATUS;
1300 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1301 REGEX_CHECK_STATUS;
1302 REGEX_ASSERT(matcher.getStackLimit() == 0);
1303
1304 // With a limited stack, the match should fail
1305 status = U_ZERO_ERROR;
1306 matcher.setStackLimit(10000, status);
1307 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1308 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1309 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1310 }
1311
1312 // A pattern that doesn't save state should work with
1313 // a minimal sized stack
1314 {
1315 UErrorCode status = U_ZERO_ERROR;
1316 UnicodeString testString = "abc";
1317 RegexMatcher matcher("abc", testString, 0, status);
1318 REGEX_CHECK_STATUS;
1319 matcher.setStackLimit(30, status);
1320 REGEX_CHECK_STATUS;
1321 REGEX_ASSERT(matcher.matches(status) == TRUE);
1322 REGEX_CHECK_STATUS;
1323 REGEX_ASSERT(matcher.getStackLimit() == 30);
1324
1325 // Negative stack sizes should fail
1326 status = U_ZERO_ERROR;
1327 matcher.setStackLimit(1000, status);
1328 REGEX_CHECK_STATUS;
1329 matcher.setStackLimit(-1, status);
1330 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1331 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1332 }
1333
1334
1335 }
1336
1337
1338
1339
1340
1341
1342 //---------------------------------------------------------------------------
1343 //
1344 // API_Replace API test for class RegexMatcher, testing the
1345 // Replace family of functions.
1346 //
1347 //---------------------------------------------------------------------------
1348 void RegexTest::API_Replace() {
1349 //
1350 // Replace
1351 //
1352 int32_t flags=0;
1353 UParseError pe;
1354 UErrorCode status=U_ZERO_ERROR;
1355
1356 UnicodeString re("abc");
1357 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1358 REGEX_CHECK_STATUS;
1359 UnicodeString data = ".abc..abc...abc..";
1360 // 012345678901234567
1361 RegexMatcher *matcher = pat->matcher(data, status);
1362
1363 //
1364 // Plain vanilla matches.
1365 //
1366 UnicodeString dest;
1367 dest = matcher->replaceFirst("yz", status);
1368 REGEX_CHECK_STATUS;
1369 REGEX_ASSERT(dest == ".yz..abc...abc..");
1370
1371 dest = matcher->replaceAll("yz", status);
1372 REGEX_CHECK_STATUS;
1373 REGEX_ASSERT(dest == ".yz..yz...yz..");
1374
1375 //
1376 // Plain vanilla non-matches.
1377 //
1378 UnicodeString d2 = ".abx..abx...abx..";
1379 matcher->reset(d2);
1380 dest = matcher->replaceFirst("yz", status);
1381 REGEX_CHECK_STATUS;
1382 REGEX_ASSERT(dest == ".abx..abx...abx..");
1383
1384 dest = matcher->replaceAll("yz", status);
1385 REGEX_CHECK_STATUS;
1386 REGEX_ASSERT(dest == ".abx..abx...abx..");
1387
1388 //
1389 // Empty source string
1390 //
1391 UnicodeString d3 = "";
1392 matcher->reset(d3);
1393 dest = matcher->replaceFirst("yz", status);
1394 REGEX_CHECK_STATUS;
1395 REGEX_ASSERT(dest == "");
1396
1397 dest = matcher->replaceAll("yz", status);
1398 REGEX_CHECK_STATUS;
1399 REGEX_ASSERT(dest == "");
1400
1401 //
1402 // Empty substitution string
1403 //
1404 matcher->reset(data); // ".abc..abc...abc.."
1405 dest = matcher->replaceFirst("", status);
1406 REGEX_CHECK_STATUS;
1407 REGEX_ASSERT(dest == "...abc...abc..");
1408
1409 dest = matcher->replaceAll("", status);
1410 REGEX_CHECK_STATUS;
1411 REGEX_ASSERT(dest == "........");
1412
1413 //
1414 // match whole string
1415 //
1416 UnicodeString d4 = "abc";
1417 matcher->reset(d4);
1418 dest = matcher->replaceFirst("xyz", status);
1419 REGEX_CHECK_STATUS;
1420 REGEX_ASSERT(dest == "xyz");
1421
1422 dest = matcher->replaceAll("xyz", status);
1423 REGEX_CHECK_STATUS;
1424 REGEX_ASSERT(dest == "xyz");
1425
1426 //
1427 // Capture Group, simple case
1428 //
1429 UnicodeString re2("a(..)");
1430 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1431 REGEX_CHECK_STATUS;
1432 UnicodeString d5 = "abcdefg";
1433 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1434 REGEX_CHECK_STATUS;
1435 dest = matcher2->replaceFirst("$1$1", status);
1436 REGEX_CHECK_STATUS;
1437 REGEX_ASSERT(dest == "bcbcdefg");
1438
1439 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1440 REGEX_CHECK_STATUS;
1441 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1442
1443 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1444 REGEX_ASSERT(U_FAILURE(status));
1445 status = U_ZERO_ERROR;
1446
1447 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1448 replacement = replacement.unescape();
1449 dest = matcher2->replaceFirst(replacement, status);
1450 REGEX_CHECK_STATUS;
1451 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1452
1453 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1454
1455
1456 //
1457 // Replacement String with \u hex escapes
1458 //
1459 {
1460 UnicodeString src = "abc 1 abc 2 abc 3";
1461 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1462 matcher->reset(src);
1463 UnicodeString result = matcher->replaceAll(substitute, status);
1464 REGEX_CHECK_STATUS;
1465 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1466 }
1467 {
1468 UnicodeString src = "abc !";
1469 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1470 matcher->reset(src);
1471 UnicodeString result = matcher->replaceAll(substitute, status);
1472 REGEX_CHECK_STATUS;
1473 UnicodeString expected = UnicodeString("--");
1474 expected.append((UChar32)0x10000);
1475 expected.append("-- !");
1476 REGEX_ASSERT(result == expected);
1477 }
1478 // TODO: need more through testing of capture substitutions.
1479
1480 // Bug 4057
1481 //
1482 {
1483 status = U_ZERO_ERROR;
1484 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1485 RegexMatcher m("ss(.*?)ee", 0, status);
1486 REGEX_CHECK_STATUS;
1487 UnicodeString result;
1488
1489 // Multiple finds do NOT bump up the previous appendReplacement postion.
1490 m.reset(s);
1491 m.find();
1492 m.find();
1493 m.appendReplacement(result, "ooh", status);
1494 REGEX_CHECK_STATUS;
1495 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1496
1497 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1498 status = U_ZERO_ERROR;
1499 result.truncate(0);
1500 m.reset(10, status);
1501 m.find();
1502 m.find();
1503 m.appendReplacement(result, "ooh", status);
1504 REGEX_CHECK_STATUS;
1505 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1506
1507 // find() at interior of string, appendReplacemnt still starts at beginning.
1508 status = U_ZERO_ERROR;
1509 result.truncate(0);
1510 m.reset();
1511 m.find(10, status);
1512 m.find();
1513 m.appendReplacement(result, "ooh", status);
1514 REGEX_CHECK_STATUS;
1515 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1516
1517 m.appendTail(result);
1518 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1519
1520 }
1521
1522 delete matcher2;
1523 delete pat2;
1524 delete matcher;
1525 delete pat;
1526 }
1527
1528
1529 //---------------------------------------------------------------------------
1530 //
1531 // API_Pattern Test that the API for class RegexPattern is
1532 // present and nominally working.
1533 //
1534 //---------------------------------------------------------------------------
1535 void RegexTest::API_Pattern() {
1536 RegexPattern pata; // Test default constructor to not crash.
1537 RegexPattern patb;
1538
1539 REGEX_ASSERT(pata == patb);
1540 REGEX_ASSERT(pata == pata);
1541
1542 UnicodeString re1("abc[a-l][m-z]");
1543 UnicodeString re2("def");
1544 UErrorCode status = U_ZERO_ERROR;
1545 UParseError pe;
1546
1547 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1548 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1549 REGEX_CHECK_STATUS;
1550 REGEX_ASSERT(*pat1 == *pat1);
1551 REGEX_ASSERT(*pat1 != pata);
1552
1553 // Assign
1554 patb = *pat1;
1555 REGEX_ASSERT(patb == *pat1);
1556
1557 // Copy Construct
1558 RegexPattern patc(*pat1);
1559 REGEX_ASSERT(patc == *pat1);
1560 REGEX_ASSERT(patb == patc);
1561 REGEX_ASSERT(pat1 != pat2);
1562 patb = *pat2;
1563 REGEX_ASSERT(patb != patc);
1564 REGEX_ASSERT(patb == *pat2);
1565
1566 // Compile with no flags.
1567 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1568 REGEX_ASSERT(*pat1a == *pat1);
1569
1570 REGEX_ASSERT(pat1a->flags() == 0);
1571
1572 // Compile with different flags should be not equal
1573 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1574 REGEX_CHECK_STATUS;
1575
1576 REGEX_ASSERT(*pat1b != *pat1a);
1577 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1578 REGEX_ASSERT(pat1a->flags() == 0);
1579 delete pat1b;
1580
1581 // clone
1582 RegexPattern *pat1c = pat1->clone();
1583 REGEX_ASSERT(*pat1c == *pat1);
1584 REGEX_ASSERT(*pat1c != *pat2);
1585
1586 delete pat1c;
1587 delete pat1a;
1588 delete pat1;
1589 delete pat2;
1590
1591
1592 //
1593 // Verify that a matcher created from a cloned pattern works.
1594 // (Jitterbug 3423)
1595 //
1596 {
1597 UErrorCode status = U_ZERO_ERROR;
1598 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1599 RegexPattern *pClone = pSource->clone();
1600 delete pSource;
1601 RegexMatcher *mFromClone = pClone->matcher(status);
1602 REGEX_CHECK_STATUS;
1603 UnicodeString s = "Hello World";
1604 mFromClone->reset(s);
1605 REGEX_ASSERT(mFromClone->find() == TRUE);
1606 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1607 REGEX_ASSERT(mFromClone->find() == TRUE);
1608 REGEX_ASSERT(mFromClone->group(status) == "World");
1609 REGEX_ASSERT(mFromClone->find() == FALSE);
1610 delete mFromClone;
1611 delete pClone;
1612 }
1613
1614 //
1615 // matches convenience API
1616 //
1617 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1618 REGEX_CHECK_STATUS;
1619 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1620 REGEX_CHECK_STATUS;
1621 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1622 REGEX_CHECK_STATUS;
1623 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1624 REGEX_CHECK_STATUS;
1625 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1626 REGEX_CHECK_STATUS;
1627 status = U_INDEX_OUTOFBOUNDS_ERROR;
1628 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1629 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1630
1631
1632 //
1633 // Split()
1634 //
1635 status = U_ZERO_ERROR;
1636 pat1 = RegexPattern::compile(" +", pe, status);
1637 REGEX_CHECK_STATUS;
1638 UnicodeString fields[10];
1639
1640 int32_t n;
1641 n = pat1->split("Now is the time", fields, 10, status);
1642 REGEX_CHECK_STATUS;
1643 REGEX_ASSERT(n==4);
1644 REGEX_ASSERT(fields[0]=="Now");
1645 REGEX_ASSERT(fields[1]=="is");
1646 REGEX_ASSERT(fields[2]=="the");
1647 REGEX_ASSERT(fields[3]=="time");
1648 REGEX_ASSERT(fields[4]=="");
1649
1650 n = pat1->split("Now is the time", fields, 2, status);
1651 REGEX_CHECK_STATUS;
1652 REGEX_ASSERT(n==2);
1653 REGEX_ASSERT(fields[0]=="Now");
1654 REGEX_ASSERT(fields[1]=="is the time");
1655 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1656
1657 fields[1] = "*";
1658 status = U_ZERO_ERROR;
1659 n = pat1->split("Now is the time", fields, 1, status);
1660 REGEX_CHECK_STATUS;
1661 REGEX_ASSERT(n==1);
1662 REGEX_ASSERT(fields[0]=="Now is the time");
1663 REGEX_ASSERT(fields[1]=="*");
1664 status = U_ZERO_ERROR;
1665
1666 n = pat1->split(" Now is the time ", fields, 10, status);
1667 REGEX_CHECK_STATUS;
1668 REGEX_ASSERT(n==6);
1669 REGEX_ASSERT(fields[0]=="");
1670 REGEX_ASSERT(fields[1]=="Now");
1671 REGEX_ASSERT(fields[2]=="is");
1672 REGEX_ASSERT(fields[3]=="the");
1673 REGEX_ASSERT(fields[4]=="time");
1674 REGEX_ASSERT(fields[5]=="");
1675
1676 n = pat1->split(" ", fields, 10, status);
1677 REGEX_CHECK_STATUS;
1678 REGEX_ASSERT(n==2);
1679 REGEX_ASSERT(fields[0]=="");
1680 REGEX_ASSERT(fields[1]=="");
1681
1682 fields[0] = "foo";
1683 n = pat1->split("", fields, 10, status);
1684 REGEX_CHECK_STATUS;
1685 REGEX_ASSERT(n==0);
1686 REGEX_ASSERT(fields[0]=="foo");
1687
1688 delete pat1;
1689
1690 // split, with a pattern with (capture)
1691 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1692 REGEX_CHECK_STATUS;
1693
1694 status = U_ZERO_ERROR;
1695 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1696 REGEX_CHECK_STATUS;
1697 REGEX_ASSERT(n==7);
1698 REGEX_ASSERT(fields[0]=="");
1699 REGEX_ASSERT(fields[1]=="a");
1700 REGEX_ASSERT(fields[2]=="Now is ");
1701 REGEX_ASSERT(fields[3]=="b");
1702 REGEX_ASSERT(fields[4]=="the time");
1703 REGEX_ASSERT(fields[5]=="c");
1704 REGEX_ASSERT(fields[6]=="");
1705 REGEX_ASSERT(status==U_ZERO_ERROR);
1706
1707 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1708 REGEX_CHECK_STATUS;
1709 REGEX_ASSERT(n==7);
1710 REGEX_ASSERT(fields[0]==" ");
1711 REGEX_ASSERT(fields[1]=="a");
1712 REGEX_ASSERT(fields[2]=="Now is ");
1713 REGEX_ASSERT(fields[3]=="b");
1714 REGEX_ASSERT(fields[4]=="the time");
1715 REGEX_ASSERT(fields[5]=="c");
1716 REGEX_ASSERT(fields[6]=="");
1717
1718 status = U_ZERO_ERROR;
1719 fields[6] = "foo";
1720 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1721 REGEX_CHECK_STATUS;
1722 REGEX_ASSERT(n==6);
1723 REGEX_ASSERT(fields[0]==" ");
1724 REGEX_ASSERT(fields[1]=="a");
1725 REGEX_ASSERT(fields[2]=="Now is ");
1726 REGEX_ASSERT(fields[3]=="b");
1727 REGEX_ASSERT(fields[4]=="the time");
1728 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1729 REGEX_ASSERT(fields[6]=="foo");
1730
1731 status = U_ZERO_ERROR;
1732 fields[5] = "foo";
1733 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1734 REGEX_CHECK_STATUS;
1735 REGEX_ASSERT(n==5);
1736 REGEX_ASSERT(fields[0]==" ");
1737 REGEX_ASSERT(fields[1]=="a");
1738 REGEX_ASSERT(fields[2]=="Now is ");
1739 REGEX_ASSERT(fields[3]=="b");
1740 REGEX_ASSERT(fields[4]=="the time<c>");
1741 REGEX_ASSERT(fields[5]=="foo");
1742
1743 status = U_ZERO_ERROR;
1744 fields[5] = "foo";
1745 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1746 REGEX_CHECK_STATUS;
1747 REGEX_ASSERT(n==5);
1748 REGEX_ASSERT(fields[0]==" ");
1749 REGEX_ASSERT(fields[1]=="a");
1750 REGEX_ASSERT(fields[2]=="Now is ");
1751 REGEX_ASSERT(fields[3]=="b");
1752 REGEX_ASSERT(fields[4]=="the time");
1753 REGEX_ASSERT(fields[5]=="foo");
1754
1755 status = U_ZERO_ERROR;
1756 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1757 REGEX_CHECK_STATUS;
1758 REGEX_ASSERT(n==4);
1759 REGEX_ASSERT(fields[0]==" ");
1760 REGEX_ASSERT(fields[1]=="a");
1761 REGEX_ASSERT(fields[2]=="Now is ");
1762 REGEX_ASSERT(fields[3]=="the time<c>");
1763 status = U_ZERO_ERROR;
1764 delete pat1;
1765
1766 pat1 = RegexPattern::compile("([-,])", pe, status);
1767 REGEX_CHECK_STATUS;
1768 n = pat1->split("1-10,20", fields, 10, status);
1769 REGEX_CHECK_STATUS;
1770 REGEX_ASSERT(n==5);
1771 REGEX_ASSERT(fields[0]=="1");
1772 REGEX_ASSERT(fields[1]=="-");
1773 REGEX_ASSERT(fields[2]=="10");
1774 REGEX_ASSERT(fields[3]==",");
1775 REGEX_ASSERT(fields[4]=="20");
1776 delete pat1;
1777
1778 // Test split of string with empty trailing fields
1779 pat1 = RegexPattern::compile(",", pe, status);
1780 REGEX_CHECK_STATUS;
1781 n = pat1->split("a,b,c,", fields, 10, status);
1782 REGEX_CHECK_STATUS;
1783 REGEX_ASSERT(n==4);
1784 REGEX_ASSERT(fields[0]=="a");
1785 REGEX_ASSERT(fields[1]=="b");
1786 REGEX_ASSERT(fields[2]=="c");
1787 REGEX_ASSERT(fields[3]=="");
1788
1789 n = pat1->split("a,,,", fields, 10, status);
1790 REGEX_CHECK_STATUS;
1791 REGEX_ASSERT(n==4);
1792 REGEX_ASSERT(fields[0]=="a");
1793 REGEX_ASSERT(fields[1]=="");
1794 REGEX_ASSERT(fields[2]=="");
1795 REGEX_ASSERT(fields[3]=="");
1796 delete pat1;
1797
1798 // Split Separator with zero length match.
1799 pat1 = RegexPattern::compile(":?", pe, status);
1800 REGEX_CHECK_STATUS;
1801 n = pat1->split("abc", fields, 10, status);
1802 REGEX_CHECK_STATUS;
1803 REGEX_ASSERT(n==5);
1804 REGEX_ASSERT(fields[0]=="");
1805 REGEX_ASSERT(fields[1]=="a");
1806 REGEX_ASSERT(fields[2]=="b");
1807 REGEX_ASSERT(fields[3]=="c");
1808 REGEX_ASSERT(fields[4]=="");
1809
1810 delete pat1;
1811
1812 //
1813 // RegexPattern::pattern()
1814 //
1815 pat1 = new RegexPattern();
1816 REGEX_ASSERT(pat1->pattern() == "");
1817 delete pat1;
1818
1819 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1820 REGEX_CHECK_STATUS;
1821 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1822 delete pat1;
1823
1824
1825 //
1826 // classID functions
1827 //
1828 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1829 REGEX_CHECK_STATUS;
1830 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1831 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1832 UnicodeString Hello("Hello, world.");
1833 RegexMatcher *m = pat1->matcher(Hello, status);
1834 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1835 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1836 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1837 delete m;
1838 delete pat1;
1839
1840 }
1841
1842 //---------------------------------------------------------------------------
1843 //
1844 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1845 // is present and working, but excluding functions
1846 // implementing replace operations.
1847 //
1848 //---------------------------------------------------------------------------
1849 void RegexTest::API_Match_UTF8() {
1850 UParseError pe;
1851 UErrorCode status=U_ZERO_ERROR;
1852 int32_t flags = 0;
1853
1854 //
1855 // Debug - slide failing test cases early
1856 //
1857 #if 0
1858 {
1859 }
1860 return;
1861 #endif
1862
1863 //
1864 // Simple pattern compilation
1865 //
1866 {
1867 UText re = UTEXT_INITIALIZER;
1868 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1869 REGEX_VERBOSE_TEXT(&re);
1870 RegexPattern *pat2;
1871 pat2 = RegexPattern::compile(&re, flags, pe, status);
1872 REGEX_CHECK_STATUS;
1873
1874 UText input1 = UTEXT_INITIALIZER;
1875 UText input2 = UTEXT_INITIALIZER;
1876 UText empty = UTEXT_INITIALIZER;
1877 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1878 REGEX_VERBOSE_TEXT(&input1);
1879 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1880 REGEX_VERBOSE_TEXT(&input2);
1881 utext_openUChars(&empty, NULL, 0, &status);
1882
1883 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1884 int32_t input2Len = strlen("not abc");
1885
1886
1887 //
1888 // Matcher creation and reset.
1889 //
1890 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1891 REGEX_CHECK_STATUS;
1892 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1893 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1894 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1895 m1->reset(&input2);
1896 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1897 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1898 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1899 m1->reset(&input1);
1900 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1901 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1902 m1->reset(&empty);
1903 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1904 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1905
1906 //
1907 // reset(pos, status)
1908 //
1909 m1->reset(&input1);
1910 m1->reset(4, status);
1911 REGEX_CHECK_STATUS;
1912 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1913 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1914
1915 m1->reset(-1, status);
1916 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1917 status = U_ZERO_ERROR;
1918
1919 m1->reset(0, status);
1920 REGEX_CHECK_STATUS;
1921 status = U_ZERO_ERROR;
1922
1923 m1->reset(input1Len-1, status);
1924 REGEX_CHECK_STATUS;
1925 status = U_ZERO_ERROR;
1926
1927 m1->reset(input1Len, status);
1928 REGEX_CHECK_STATUS;
1929 status = U_ZERO_ERROR;
1930
1931 m1->reset(input1Len+1, status);
1932 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1933 status = U_ZERO_ERROR;
1934
1935 //
1936 // match(pos, status)
1937 //
1938 m1->reset(&input2);
1939 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1940 m1->reset();
1941 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1942 m1->reset();
1943 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1944 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1945 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1946 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1947
1948 // Match() at end of string should fail, but should not
1949 // be an error.
1950 status = U_ZERO_ERROR;
1951 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1952 REGEX_CHECK_STATUS;
1953
1954 // Match beyond end of string should fail with an error.
1955 status = U_ZERO_ERROR;
1956 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1957 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958
1959 // Successful match at end of string.
1960 {
1961 status = U_ZERO_ERROR;
1962 RegexMatcher m("A?", 0, status); // will match zero length string.
1963 REGEX_CHECK_STATUS;
1964 m.reset(&input1);
1965 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1966 REGEX_CHECK_STATUS;
1967 m.reset(&empty);
1968 REGEX_ASSERT(m.matches(0, status) == TRUE);
1969 REGEX_CHECK_STATUS;
1970 }
1971
1972
1973 //
1974 // lookingAt(pos, status)
1975 //
1976 status = U_ZERO_ERROR;
1977 m1->reset(&input2); // "not abc"
1978 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1980 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1981 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1982 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1983 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1984 status = U_ZERO_ERROR;
1985 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1986 REGEX_CHECK_STATUS;
1987 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1988 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1989
1990 delete m1;
1991 delete pat2;
1992
1993 utext_close(&re);
1994 utext_close(&input1);
1995 utext_close(&input2);
1996 utext_close(&empty);
1997 }
1998
1999
2000 //
2001 // Capture Group.
2002 // RegexMatcher::start();
2003 // RegexMatcher::end();
2004 // RegexMatcher::groupCount();
2005 //
2006 {
2007 int32_t flags=0;
2008 UParseError pe;
2009 UErrorCode status=U_ZERO_ERROR;
2010 UText re=UTEXT_INITIALIZER;
2011 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2012 utext_openUTF8(&re, str_01234567_pat, -1, &status);
2013
2014 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2015 REGEX_CHECK_STATUS;
2016
2017 UText input = UTEXT_INITIALIZER;
2018 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019 utext_openUTF8(&input, str_0123456789, -1, &status);
2020
2021 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2022 REGEX_CHECK_STATUS;
2023 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2024 static const int32_t matchStarts[] = {0, 2, 4, 8};
2025 static const int32_t matchEnds[] = {10, 8, 6, 10};
2026 int32_t i;
2027 for (i=0; i<4; i++) {
2028 int32_t actualStart = matcher->start(i, status);
2029 REGEX_CHECK_STATUS;
2030 if (actualStart != matchStarts[i]) {
2031 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2032 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2033 }
2034 int32_t actualEnd = matcher->end(i, status);
2035 REGEX_CHECK_STATUS;
2036 if (actualEnd != matchEnds[i]) {
2037 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2038 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2039 }
2040 }
2041
2042 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2043 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2044
2045 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2046 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2047 matcher->reset();
2048 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2049
2050 matcher->lookingAt(status);
2051
2052 UnicodeString dest;
2053 UText destText = UTEXT_INITIALIZER;
2054 utext_openUnicodeString(&destText, &dest, &status);
2055 UText *result;
2056 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2057 // Test shallow-clone API
2058 int64_t group_len;
2059 result = matcher->group((UText *)NULL, group_len, status);
2060 REGEX_CHECK_STATUS;
2061 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2062 utext_close(result);
2063 result = matcher->group(0, &destText, group_len, status);
2064 REGEX_CHECK_STATUS;
2065 REGEX_ASSERT(result == &destText);
2066 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2067 // destText is now immutable, reopen it
2068 utext_close(&destText);
2069 utext_openUnicodeString(&destText, &dest, &status);
2070
2071 int64_t length;
2072 result = matcher->group(0, NULL, length, status);
2073 REGEX_CHECK_STATUS;
2074 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2075 utext_close(result);
2076 result = matcher->group(0, &destText, length, status);
2077 REGEX_CHECK_STATUS;
2078 REGEX_ASSERT(result == &destText);
2079 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2080 REGEX_ASSERT(length == 10);
2081 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2082
2083 // Capture Group 1 == "234567"
2084 result = matcher->group(1, NULL, length, status);
2085 REGEX_CHECK_STATUS;
2086 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2087 REGEX_ASSERT(length == 6);
2088 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2089 utext_close(result);
2090
2091 result = matcher->group(1, &destText, length, status);
2092 REGEX_CHECK_STATUS;
2093 REGEX_ASSERT(result == &destText);
2094 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2095 REGEX_ASSERT(length == 6);
2096 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2097 utext_close(result);
2098
2099 // Capture Group 2 == "45"
2100 result = matcher->group(2, NULL, length, status);
2101 REGEX_CHECK_STATUS;
2102 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2103 REGEX_ASSERT(length == 2);
2104 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2105 utext_close(result);
2106
2107 result = matcher->group(2, &destText, length, status);
2108 REGEX_CHECK_STATUS;
2109 REGEX_ASSERT(result == &destText);
2110 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2111 REGEX_ASSERT(length == 2);
2112 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2113 utext_close(result);
2114
2115 // Capture Group 3 == "89"
2116 result = matcher->group(3, NULL, length, status);
2117 REGEX_CHECK_STATUS;
2118 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2119 REGEX_ASSERT(length == 2);
2120 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2121 utext_close(result);
2122
2123 result = matcher->group(3, &destText, length, status);
2124 REGEX_CHECK_STATUS;
2125 REGEX_ASSERT(result == &destText);
2126 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2127 REGEX_ASSERT(length == 2);
2128 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2129 utext_close(result);
2130
2131 // Capture Group number out of range.
2132 status = U_ZERO_ERROR;
2133 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134 status = U_ZERO_ERROR;
2135 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136 status = U_ZERO_ERROR;
2137 matcher->reset();
2138 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2139
2140 delete matcher;
2141 delete pat;
2142
2143 utext_close(&destText);
2144 utext_close(&input);
2145 utext_close(&re);
2146 }
2147
2148 //
2149 // find
2150 //
2151 {
2152 int32_t flags=0;
2153 UParseError pe;
2154 UErrorCode status=U_ZERO_ERROR;
2155 UText re=UTEXT_INITIALIZER;
2156 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2157 utext_openUTF8(&re, str_abc, -1, &status);
2158
2159 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2160 REGEX_CHECK_STATUS;
2161 UText input = UTEXT_INITIALIZER;
2162 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2163 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2164 // 012345678901234567
2165
2166 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2167 REGEX_CHECK_STATUS;
2168 REGEX_ASSERT(matcher->find());
2169 REGEX_ASSERT(matcher->start(status) == 1);
2170 REGEX_ASSERT(matcher->find());
2171 REGEX_ASSERT(matcher->start(status) == 6);
2172 REGEX_ASSERT(matcher->find());
2173 REGEX_ASSERT(matcher->start(status) == 12);
2174 REGEX_ASSERT(matcher->find() == FALSE);
2175 REGEX_ASSERT(matcher->find() == FALSE);
2176
2177 matcher->reset();
2178 REGEX_ASSERT(matcher->find());
2179 REGEX_ASSERT(matcher->start(status) == 1);
2180
2181 REGEX_ASSERT(matcher->find(0, status));
2182 REGEX_ASSERT(matcher->start(status) == 1);
2183 REGEX_ASSERT(matcher->find(1, status));
2184 REGEX_ASSERT(matcher->start(status) == 1);
2185 REGEX_ASSERT(matcher->find(2, status));
2186 REGEX_ASSERT(matcher->start(status) == 6);
2187 REGEX_ASSERT(matcher->find(12, status));
2188 REGEX_ASSERT(matcher->start(status) == 12);
2189 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2190 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2191 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2192 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2193
2194 status = U_ZERO_ERROR;
2195 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2196 status = U_ZERO_ERROR;
2197 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198
2199 REGEX_ASSERT(matcher->groupCount() == 0);
2200
2201 delete matcher;
2202 delete pat;
2203
2204 utext_close(&input);
2205 utext_close(&re);
2206 }
2207
2208
2209 //
2210 // find, with \G in pattern (true if at the end of a previous match).
2211 //
2212 {
2213 int32_t flags=0;
2214 UParseError pe;
2215 UErrorCode status=U_ZERO_ERROR;
2216 UText re=UTEXT_INITIALIZER;
2217 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2218 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2219
2220 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2221
2222 REGEX_CHECK_STATUS;
2223 UText input = UTEXT_INITIALIZER;
2224 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2225 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2226 // 012345678901234567
2227
2228 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2229 REGEX_CHECK_STATUS;
2230 REGEX_ASSERT(matcher->find());
2231 REGEX_ASSERT(matcher->start(status) == 0);
2232 REGEX_ASSERT(matcher->start(1, status) == -1);
2233 REGEX_ASSERT(matcher->start(2, status) == 1);
2234
2235 REGEX_ASSERT(matcher->find());
2236 REGEX_ASSERT(matcher->start(status) == 4);
2237 REGEX_ASSERT(matcher->start(1, status) == 4);
2238 REGEX_ASSERT(matcher->start(2, status) == -1);
2239 REGEX_CHECK_STATUS;
2240
2241 delete matcher;
2242 delete pat;
2243
2244 utext_close(&input);
2245 utext_close(&re);
2246 }
2247
2248 //
2249 // find with zero length matches, match position should bump ahead
2250 // to prevent loops.
2251 //
2252 {
2253 int32_t i;
2254 UErrorCode status=U_ZERO_ERROR;
2255 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2256 // using an always-true look-ahead.
2257 REGEX_CHECK_STATUS;
2258 UText s = UTEXT_INITIALIZER;
2259 utext_openUTF8(&s, " ", -1, &status);
2260 m.reset(&s);
2261 for (i=0; ; i++) {
2262 if (m.find() == FALSE) {
2263 break;
2264 }
2265 REGEX_ASSERT(m.start(status) == i);
2266 REGEX_ASSERT(m.end(status) == i);
2267 }
2268 REGEX_ASSERT(i==5);
2269
2270 // Check that the bump goes over characters outside the BMP OK
2271 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2272 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2273 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2274 m.reset(&s);
2275 for (i=0; ; i+=4) {
2276 if (m.find() == FALSE) {
2277 break;
2278 }
2279 REGEX_ASSERT(m.start(status) == i);
2280 REGEX_ASSERT(m.end(status) == i);
2281 }
2282 REGEX_ASSERT(i==20);
2283
2284 utext_close(&s);
2285 }
2286 {
2287 // find() loop breaking test.
2288 // with pattern of /.?/, should see a series of one char matches, then a single
2289 // match of zero length at the end of the input string.
2290 int32_t i;
2291 UErrorCode status=U_ZERO_ERROR;
2292 RegexMatcher m(".?", 0, status);
2293 REGEX_CHECK_STATUS;
2294 UText s = UTEXT_INITIALIZER;
2295 utext_openUTF8(&s, " ", -1, &status);
2296 m.reset(&s);
2297 for (i=0; ; i++) {
2298 if (m.find() == FALSE) {
2299 break;
2300 }
2301 REGEX_ASSERT(m.start(status) == i);
2302 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2303 }
2304 REGEX_ASSERT(i==5);
2305
2306 utext_close(&s);
2307 }
2308
2309
2310 //
2311 // Matchers with no input string behave as if they had an empty input string.
2312 //
2313
2314 {
2315 UErrorCode status = U_ZERO_ERROR;
2316 RegexMatcher m(".?", 0, status);
2317 REGEX_CHECK_STATUS;
2318 REGEX_ASSERT(m.find());
2319 REGEX_ASSERT(m.start(status) == 0);
2320 REGEX_ASSERT(m.input() == "");
2321 }
2322 {
2323 UErrorCode status = U_ZERO_ERROR;
2324 RegexPattern *p = RegexPattern::compile(".", 0, status);
2325 RegexMatcher *m = p->matcher(status);
2326 REGEX_CHECK_STATUS;
2327
2328 REGEX_ASSERT(m->find() == FALSE);
2329 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2330 delete m;
2331 delete p;
2332 }
2333
2334 //
2335 // Regions
2336 //
2337 {
2338 UErrorCode status = U_ZERO_ERROR;
2339 UText testPattern = UTEXT_INITIALIZER;
2340 UText testText = UTEXT_INITIALIZER;
2341 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2342 REGEX_VERBOSE_TEXT(&testPattern);
2343 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2344 REGEX_VERBOSE_TEXT(&testText);
2345
2346 RegexMatcher m(&testPattern, &testText, 0, status);
2347 REGEX_CHECK_STATUS;
2348 REGEX_ASSERT(m.regionStart() == 0);
2349 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2350 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2351 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352
2353 m.region(2,4, status);
2354 REGEX_CHECK_STATUS;
2355 REGEX_ASSERT(m.matches(status));
2356 REGEX_ASSERT(m.start(status)==2);
2357 REGEX_ASSERT(m.end(status)==4);
2358 REGEX_CHECK_STATUS;
2359
2360 m.reset();
2361 REGEX_ASSERT(m.regionStart() == 0);
2362 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2363
2364 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2365 REGEX_VERBOSE_TEXT(&testText);
2366 m.reset(&testText);
2367 REGEX_ASSERT(m.regionStart() == 0);
2368 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2369
2370 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2371 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2372 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2373 REGEX_ASSERT(&m == &m.reset());
2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375
2376 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2377 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2378 REGEX_ASSERT(&m == &m.reset());
2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380
2381 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2382 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2383 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2384 REGEX_ASSERT(&m == &m.reset());
2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386
2387 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2388 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2389 REGEX_ASSERT(&m == &m.reset());
2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391
2392 utext_close(&testText);
2393 utext_close(&testPattern);
2394 }
2395
2396 //
2397 // hitEnd() and requireEnd()
2398 //
2399 {
2400 UErrorCode status = U_ZERO_ERROR;
2401 UText testPattern = UTEXT_INITIALIZER;
2402 UText testText = UTEXT_INITIALIZER;
2403 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2404 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2405 utext_openUTF8(&testPattern, str_, -1, &status);
2406 utext_openUTF8(&testText, str_aabb, -1, &status);
2407
2408 RegexMatcher m1(&testPattern, &testText, 0, status);
2409 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2410 REGEX_ASSERT(m1.hitEnd() == TRUE);
2411 REGEX_ASSERT(m1.requireEnd() == FALSE);
2412 REGEX_CHECK_STATUS;
2413
2414 status = U_ZERO_ERROR;
2415 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2416 utext_openUTF8(&testPattern, str_a, -1, &status);
2417 RegexMatcher m2(&testPattern, &testText, 0, status);
2418 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2419 REGEX_ASSERT(m2.hitEnd() == FALSE);
2420 REGEX_ASSERT(m2.requireEnd() == FALSE);
2421 REGEX_CHECK_STATUS;
2422
2423 status = U_ZERO_ERROR;
2424 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2425 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2426 RegexMatcher m3(&testPattern, &testText, 0, status);
2427 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2428 REGEX_ASSERT(m3.hitEnd() == TRUE);
2429 REGEX_ASSERT(m3.requireEnd() == TRUE);
2430 REGEX_CHECK_STATUS;
2431
2432 utext_close(&testText);
2433 utext_close(&testPattern);
2434 }
2435 }
2436
2437
2438 //---------------------------------------------------------------------------
2439 //
2440 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2441 // Replace family of functions.
2442 //
2443 //---------------------------------------------------------------------------
2444 void RegexTest::API_Replace_UTF8() {
2445 //
2446 // Replace
2447 //
2448 int32_t flags=0;
2449 UParseError pe;
2450 UErrorCode status=U_ZERO_ERROR;
2451
2452 UText re=UTEXT_INITIALIZER;
2453 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2454 REGEX_VERBOSE_TEXT(&re);
2455 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2456 REGEX_CHECK_STATUS;
2457
2458 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2459 // 012345678901234567
2460 UText dataText = UTEXT_INITIALIZER;
2461 utext_openUTF8(&dataText, data, -1, &status);
2462 REGEX_CHECK_STATUS;
2463 REGEX_VERBOSE_TEXT(&dataText);
2464 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2465
2466 //
2467 // Plain vanilla matches.
2468 //
2469 UnicodeString dest;
2470 UText destText = UTEXT_INITIALIZER;
2471 utext_openUnicodeString(&destText, &dest, &status);
2472 UText *result;
2473
2474 UText replText = UTEXT_INITIALIZER;
2475
2476 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2477 utext_openUTF8(&replText, str_yz, -1, &status);
2478 REGEX_VERBOSE_TEXT(&replText);
2479 result = matcher->replaceFirst(&replText, NULL, status);
2480 REGEX_CHECK_STATUS;
2481 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2482 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2483 utext_close(result);
2484 result = matcher->replaceFirst(&replText, &destText, status);
2485 REGEX_CHECK_STATUS;
2486 REGEX_ASSERT(result == &destText);
2487 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2488
2489 result = matcher->replaceAll(&replText, NULL, status);
2490 REGEX_CHECK_STATUS;
2491 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2492 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2493 utext_close(result);
2494
2495 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496 result = matcher->replaceAll(&replText, &destText, status);
2497 REGEX_CHECK_STATUS;
2498 REGEX_ASSERT(result == &destText);
2499 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2500
2501 //
2502 // Plain vanilla non-matches.
2503 //
2504 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2505 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2506 matcher->reset(&dataText);
2507
2508 result = matcher->replaceFirst(&replText, NULL, status);
2509 REGEX_CHECK_STATUS;
2510 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2511 utext_close(result);
2512 result = matcher->replaceFirst(&replText, &destText, status);
2513 REGEX_CHECK_STATUS;
2514 REGEX_ASSERT(result == &destText);
2515 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2516
2517 result = matcher->replaceAll(&replText, NULL, status);
2518 REGEX_CHECK_STATUS;
2519 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2520 utext_close(result);
2521 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522 result = matcher->replaceAll(&replText, &destText, status);
2523 REGEX_CHECK_STATUS;
2524 REGEX_ASSERT(result == &destText);
2525 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2526
2527 //
2528 // Empty source string
2529 //
2530 utext_openUTF8(&dataText, NULL, 0, &status);
2531 matcher->reset(&dataText);
2532
2533 result = matcher->replaceFirst(&replText, NULL, status);
2534 REGEX_CHECK_STATUS;
2535 REGEX_ASSERT_UTEXT_UTF8("", result);
2536 utext_close(result);
2537 result = matcher->replaceFirst(&replText, &destText, status);
2538 REGEX_CHECK_STATUS;
2539 REGEX_ASSERT(result == &destText);
2540 REGEX_ASSERT_UTEXT_UTF8("", result);
2541
2542 result = matcher->replaceAll(&replText, NULL, status);
2543 REGEX_CHECK_STATUS;
2544 REGEX_ASSERT_UTEXT_UTF8("", result);
2545 utext_close(result);
2546 result = matcher->replaceAll(&replText, &destText, status);
2547 REGEX_CHECK_STATUS;
2548 REGEX_ASSERT(result == &destText);
2549 REGEX_ASSERT_UTEXT_UTF8("", result);
2550
2551 //
2552 // Empty substitution string
2553 //
2554 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2555 matcher->reset(&dataText);
2556
2557 utext_openUTF8(&replText, NULL, 0, &status);
2558 result = matcher->replaceFirst(&replText, NULL, status);
2559 REGEX_CHECK_STATUS;
2560 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2561 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2562 utext_close(result);
2563 result = matcher->replaceFirst(&replText, &destText, status);
2564 REGEX_CHECK_STATUS;
2565 REGEX_ASSERT(result == &destText);
2566 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2567
2568 result = matcher->replaceAll(&replText, NULL, status);
2569 REGEX_CHECK_STATUS;
2570 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2571 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2572 utext_close(result);
2573 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2574 result = matcher->replaceAll(&replText, &destText, status);
2575 REGEX_CHECK_STATUS;
2576 REGEX_ASSERT(result == &destText);
2577 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2578
2579 //
2580 // match whole string
2581 //
2582 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2583 utext_openUTF8(&dataText, str_abc, -1, &status);
2584 matcher->reset(&dataText);
2585
2586 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2587 utext_openUTF8(&replText, str_xyz, -1, &status);
2588 result = matcher->replaceFirst(&replText, NULL, status);
2589 REGEX_CHECK_STATUS;
2590 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2591 utext_close(result);
2592 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593 result = matcher->replaceFirst(&replText, &destText, status);
2594 REGEX_CHECK_STATUS;
2595 REGEX_ASSERT(result == &destText);
2596 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2597
2598 result = matcher->replaceAll(&replText, NULL, status);
2599 REGEX_CHECK_STATUS;
2600 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2601 utext_close(result);
2602 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603 result = matcher->replaceAll(&replText, &destText, status);
2604 REGEX_CHECK_STATUS;
2605 REGEX_ASSERT(result == &destText);
2606 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2607
2608 //
2609 // Capture Group, simple case
2610 //
2611 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2612 utext_openUTF8(&re, str_add, -1, &status);
2613 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2614 REGEX_CHECK_STATUS;
2615
2616 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2617 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2618 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2619 REGEX_CHECK_STATUS;
2620
2621 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2622 utext_openUTF8(&replText, str_11, -1, &status);
2623 result = matcher2->replaceFirst(&replText, NULL, status);
2624 REGEX_CHECK_STATUS;
2625 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2626 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2627 utext_close(result);
2628 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629 result = matcher2->replaceFirst(&replText, &destText, status);
2630 REGEX_CHECK_STATUS;
2631 REGEX_ASSERT(result == &destText);
2632 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2633
2634 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2635 utext_openUTF8(&replText, str_v, -1, &status);
2636 REGEX_VERBOSE_TEXT(&replText);
2637 result = matcher2->replaceFirst(&replText, NULL, status);
2638 REGEX_CHECK_STATUS;
2639 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2640 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2641 utext_close(result);
2642 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643 result = matcher2->replaceFirst(&replText, &destText, status);
2644 REGEX_CHECK_STATUS;
2645 REGEX_ASSERT(result == &destText);
2646 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2647
2648 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2651 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2652 result = matcher2->replaceFirst(&replText, NULL, status);
2653 REGEX_CHECK_STATUS;
2654 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2655 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2656 utext_close(result);
2657 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2658 result = matcher2->replaceFirst(&replText, &destText, status);
2659 REGEX_CHECK_STATUS;
2660 REGEX_ASSERT(result == &destText);
2661 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2662
2663 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2664 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2665 // 012345678901234567890123456
2666 supplDigitChars[22] = 0xF0;
2667 supplDigitChars[23] = 0x9D;
2668 supplDigitChars[24] = 0x9F;
2669 supplDigitChars[25] = 0x8F;
2670 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2671
2672 result = matcher2->replaceFirst(&replText, NULL, status);
2673 REGEX_CHECK_STATUS;
2674 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2675 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2676 utext_close(result);
2677 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678 result = matcher2->replaceFirst(&replText, &destText, status);
2679 REGEX_CHECK_STATUS;
2680 REGEX_ASSERT(result == &destText);
2681 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2682 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2683 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2684 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2685 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2686 utext_close(result);
2687 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2688 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2689 REGEX_ASSERT(result == &destText);
2690 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2691
2692 //
2693 // Replacement String with \u hex escapes
2694 //
2695 {
2696 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2697 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2698 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2699 utext_openUTF8(&replText, str_u0043, -1, &status);
2700 matcher->reset(&dataText);
2701
2702 result = matcher->replaceAll(&replText, NULL, status);
2703 REGEX_CHECK_STATUS;
2704 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2705 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2706 utext_close(result);
2707 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2708 result = matcher->replaceAll(&replText, &destText, status);
2709 REGEX_CHECK_STATUS;
2710 REGEX_ASSERT(result == &destText);
2711 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2712 }
2713 {
2714 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2715 utext_openUTF8(&dataText, str_abc, -1, &status);
2716 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2717 utext_openUTF8(&replText, str_U00010000, -1, &status);
2718 matcher->reset(&dataText);
2719
2720 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2721 // 0123456789
2722 expected[2] = 0xF0;
2723 expected[3] = 0x90;
2724 expected[4] = 0x80;
2725 expected[5] = 0x80;
2726
2727 result = matcher->replaceAll(&replText, NULL, status);
2728 REGEX_CHECK_STATUS;
2729 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2730 utext_close(result);
2731 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2732 result = matcher->replaceAll(&replText, &destText, status);
2733 REGEX_CHECK_STATUS;
2734 REGEX_ASSERT(result == &destText);
2735 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2736 }
2737 // TODO: need more thorough testing of capture substitutions.
2738
2739 // Bug 4057
2740 //
2741 {
2742 status = U_ZERO_ERROR;
2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2746 utext_openUTF8(&re, str_ssee, -1, &status);
2747 utext_openUTF8(&dataText, str_blah, -1, &status);
2748 utext_openUTF8(&replText, str_ooh, -1, &status);
2749
2750 RegexMatcher m(&re, 0, status);
2751 REGEX_CHECK_STATUS;
2752
2753 UnicodeString result;
2754 UText resultText = UTEXT_INITIALIZER;
2755 utext_openUnicodeString(&resultText, &result, &status);
2756
2757 // Multiple finds do NOT bump up the previous appendReplacement position.
2758 m.reset(&dataText);
2759 m.find();
2760 m.find();
2761 m.appendReplacement(&resultText, &replText, status);
2762 REGEX_CHECK_STATUS;
2763 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2764 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2765
2766 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2767 status = U_ZERO_ERROR;
2768 result.truncate(0);
2769 utext_openUnicodeString(&resultText, &result, &status);
2770 m.reset(10, status);
2771 m.find();
2772 m.find();
2773 m.appendReplacement(&resultText, &replText, status);
2774 REGEX_CHECK_STATUS;
2775 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2776 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2777
2778 // find() at interior of string, appendReplacement still starts at beginning.
2779 status = U_ZERO_ERROR;
2780 result.truncate(0);
2781 utext_openUnicodeString(&resultText, &result, &status);
2782 m.reset();
2783 m.find(10, status);
2784 m.find();
2785 m.appendReplacement(&resultText, &replText, status);
2786 REGEX_CHECK_STATUS;
2787 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2788 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2789
2790 m.appendTail(&resultText, status);
2791 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2792 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2793
2794 utext_close(&resultText);
2795 }
2796
2797 delete matcher2;
2798 delete pat2;
2799 delete matcher;
2800 delete pat;
2801
2802 utext_close(&dataText);
2803 utext_close(&replText);
2804 utext_close(&destText);
2805 utext_close(&re);
2806 }
2807
2808
2809 //---------------------------------------------------------------------------
2810 //
2811 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2812 // present and nominally working.
2813 //
2814 //---------------------------------------------------------------------------
2815 void RegexTest::API_Pattern_UTF8() {
2816 RegexPattern pata; // Test default constructor to not crash.
2817 RegexPattern patb;
2818
2819 REGEX_ASSERT(pata == patb);
2820 REGEX_ASSERT(pata == pata);
2821
2822 UText re1 = UTEXT_INITIALIZER;
2823 UText re2 = UTEXT_INITIALIZER;
2824 UErrorCode status = U_ZERO_ERROR;
2825 UParseError pe;
2826
2827 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2828 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2829 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2830 utext_openUTF8(&re2, str_def, -1, &status);
2831
2832 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2833 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2834 REGEX_CHECK_STATUS;
2835 REGEX_ASSERT(*pat1 == *pat1);
2836 REGEX_ASSERT(*pat1 != pata);
2837
2838 // Assign
2839 patb = *pat1;
2840 REGEX_ASSERT(patb == *pat1);
2841
2842 // Copy Construct
2843 RegexPattern patc(*pat1);
2844 REGEX_ASSERT(patc == *pat1);
2845 REGEX_ASSERT(patb == patc);
2846 REGEX_ASSERT(pat1 != pat2);
2847 patb = *pat2;
2848 REGEX_ASSERT(patb != patc);
2849 REGEX_ASSERT(patb == *pat2);
2850
2851 // Compile with no flags.
2852 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2853 REGEX_ASSERT(*pat1a == *pat1);
2854
2855 REGEX_ASSERT(pat1a->flags() == 0);
2856
2857 // Compile with different flags should be not equal
2858 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2859 REGEX_CHECK_STATUS;
2860
2861 REGEX_ASSERT(*pat1b != *pat1a);
2862 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2863 REGEX_ASSERT(pat1a->flags() == 0);
2864 delete pat1b;
2865
2866 // clone
2867 RegexPattern *pat1c = pat1->clone();
2868 REGEX_ASSERT(*pat1c == *pat1);
2869 REGEX_ASSERT(*pat1c != *pat2);
2870
2871 delete pat1c;
2872 delete pat1a;
2873 delete pat1;
2874 delete pat2;
2875
2876 utext_close(&re1);
2877 utext_close(&re2);
2878
2879
2880 //
2881 // Verify that a matcher created from a cloned pattern works.
2882 // (Jitterbug 3423)
2883 //
2884 {
2885 UErrorCode status = U_ZERO_ERROR;
2886 UText pattern = UTEXT_INITIALIZER;
2887 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2888 utext_openUTF8(&pattern, str_pL, -1, &status);
2889
2890 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2891 RegexPattern *pClone = pSource->clone();
2892 delete pSource;
2893 RegexMatcher *mFromClone = pClone->matcher(status);
2894 REGEX_CHECK_STATUS;
2895
2896 UText input = UTEXT_INITIALIZER;
2897 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2898 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2899 mFromClone->reset(&input);
2900 REGEX_ASSERT(mFromClone->find() == TRUE);
2901 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2902 REGEX_ASSERT(mFromClone->find() == TRUE);
2903 REGEX_ASSERT(mFromClone->group(status) == "World");
2904 REGEX_ASSERT(mFromClone->find() == FALSE);
2905 delete mFromClone;
2906 delete pClone;
2907
2908 utext_close(&input);
2909 utext_close(&pattern);
2910 }
2911
2912 //
2913 // matches convenience API
2914 //
2915 {
2916 UErrorCode status = U_ZERO_ERROR;
2917 UText pattern = UTEXT_INITIALIZER;
2918 UText input = UTEXT_INITIALIZER;
2919
2920 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2921 utext_openUTF8(&input, str_randominput, -1, &status);
2922
2923 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2924 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2925 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2926 REGEX_CHECK_STATUS;
2927
2928 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2929 utext_openUTF8(&pattern, str_abc, -1, &status);
2930 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2931 REGEX_CHECK_STATUS;
2932
2933 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2934 utext_openUTF8(&pattern, str_nput, -1, &status);
2935 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2936 REGEX_CHECK_STATUS;
2937
2938 utext_openUTF8(&pattern, str_randominput, -1, &status);
2939 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2940 REGEX_CHECK_STATUS;
2941
2942 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2943 utext_openUTF8(&pattern, str_u, -1, &status);
2944 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2945 REGEX_CHECK_STATUS;
2946
2947 utext_openUTF8(&input, str_abc, -1, &status);
2948 utext_openUTF8(&pattern, str_abc, -1, &status);
2949 status = U_INDEX_OUTOFBOUNDS_ERROR;
2950 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2951 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2952
2953 utext_close(&input);
2954 utext_close(&pattern);
2955 }
2956
2957
2958 //
2959 // Split()
2960 //
2961 status = U_ZERO_ERROR;
2962 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2963 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2964 pat1 = RegexPattern::compile(&re1, pe, status);
2965 REGEX_CHECK_STATUS;
2966 UnicodeString fields[10];
2967
2968 int32_t n;
2969 n = pat1->split("Now is the time", fields, 10, status);
2970 REGEX_CHECK_STATUS;
2971 REGEX_ASSERT(n==4);
2972 REGEX_ASSERT(fields[0]=="Now");
2973 REGEX_ASSERT(fields[1]=="is");
2974 REGEX_ASSERT(fields[2]=="the");
2975 REGEX_ASSERT(fields[3]=="time");
2976 REGEX_ASSERT(fields[4]=="");
2977
2978 n = pat1->split("Now is the time", fields, 2, status);
2979 REGEX_CHECK_STATUS;
2980 REGEX_ASSERT(n==2);
2981 REGEX_ASSERT(fields[0]=="Now");
2982 REGEX_ASSERT(fields[1]=="is the time");
2983 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2984
2985 fields[1] = "*";
2986 status = U_ZERO_ERROR;
2987 n = pat1->split("Now is the time", fields, 1, status);
2988 REGEX_CHECK_STATUS;
2989 REGEX_ASSERT(n==1);
2990 REGEX_ASSERT(fields[0]=="Now is the time");
2991 REGEX_ASSERT(fields[1]=="*");
2992 status = U_ZERO_ERROR;
2993
2994 n = pat1->split(" Now is the time ", fields, 10, status);
2995 REGEX_CHECK_STATUS;
2996 REGEX_ASSERT(n==6);
2997 REGEX_ASSERT(fields[0]=="");
2998 REGEX_ASSERT(fields[1]=="Now");
2999 REGEX_ASSERT(fields[2]=="is");
3000 REGEX_ASSERT(fields[3]=="the");
3001 REGEX_ASSERT(fields[4]=="time");
3002 REGEX_ASSERT(fields[5]=="");
3003 REGEX_ASSERT(fields[6]=="");
3004
3005 fields[2] = "*";
3006 n = pat1->split(" ", fields, 10, status);
3007 REGEX_CHECK_STATUS;
3008 REGEX_ASSERT(n==2);
3009 REGEX_ASSERT(fields[0]=="");
3010 REGEX_ASSERT(fields[1]=="");
3011 REGEX_ASSERT(fields[2]=="*");
3012
3013 fields[0] = "foo";
3014 n = pat1->split("", fields, 10, status);
3015 REGEX_CHECK_STATUS;
3016 REGEX_ASSERT(n==0);
3017 REGEX_ASSERT(fields[0]=="foo");
3018
3019 delete pat1;
3020
3021 // split, with a pattern with (capture)
3022 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3023 pat1 = RegexPattern::compile(&re1, pe, status);
3024 REGEX_CHECK_STATUS;
3025
3026 status = U_ZERO_ERROR;
3027 fields[6] = fields[7] = "*";
3028 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3029 REGEX_CHECK_STATUS;
3030 REGEX_ASSERT(n==7);
3031 REGEX_ASSERT(fields[0]=="");
3032 REGEX_ASSERT(fields[1]=="a");
3033 REGEX_ASSERT(fields[2]=="Now is ");
3034 REGEX_ASSERT(fields[3]=="b");
3035 REGEX_ASSERT(fields[4]=="the time");
3036 REGEX_ASSERT(fields[5]=="c");
3037 REGEX_ASSERT(fields[6]=="");
3038 REGEX_ASSERT(fields[7]=="*");
3039 REGEX_ASSERT(status==U_ZERO_ERROR);
3040
3041 fields[6] = fields[7] = "*";
3042 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3043 REGEX_CHECK_STATUS;
3044 REGEX_ASSERT(n==7);
3045 REGEX_ASSERT(fields[0]==" ");
3046 REGEX_ASSERT(fields[1]=="a");
3047 REGEX_ASSERT(fields[2]=="Now is ");
3048 REGEX_ASSERT(fields[3]=="b");
3049 REGEX_ASSERT(fields[4]=="the time");
3050 REGEX_ASSERT(fields[5]=="c");
3051 REGEX_ASSERT(fields[6]=="");
3052 REGEX_ASSERT(fields[7]=="*");
3053
3054 status = U_ZERO_ERROR;
3055 fields[6] = "foo";
3056 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3057 REGEX_CHECK_STATUS;
3058 REGEX_ASSERT(n==6);
3059 REGEX_ASSERT(fields[0]==" ");
3060 REGEX_ASSERT(fields[1]=="a");
3061 REGEX_ASSERT(fields[2]=="Now is ");
3062 REGEX_ASSERT(fields[3]=="b");
3063 REGEX_ASSERT(fields[4]=="the time");
3064 REGEX_ASSERT(fields[5]==" ");
3065 REGEX_ASSERT(fields[6]=="foo");
3066
3067 status = U_ZERO_ERROR;
3068 fields[5] = "foo";
3069 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3070 REGEX_CHECK_STATUS;
3071 REGEX_ASSERT(n==5);
3072 REGEX_ASSERT(fields[0]==" ");
3073 REGEX_ASSERT(fields[1]=="a");
3074 REGEX_ASSERT(fields[2]=="Now is ");
3075 REGEX_ASSERT(fields[3]=="b");
3076 REGEX_ASSERT(fields[4]=="the time<c>");
3077 REGEX_ASSERT(fields[5]=="foo");
3078
3079 status = U_ZERO_ERROR;
3080 fields[5] = "foo";
3081 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3082 REGEX_CHECK_STATUS;
3083 REGEX_ASSERT(n==5);
3084 REGEX_ASSERT(fields[0]==" ");
3085 REGEX_ASSERT(fields[1]=="a");
3086 REGEX_ASSERT(fields[2]=="Now is ");
3087 REGEX_ASSERT(fields[3]=="b");
3088 REGEX_ASSERT(fields[4]=="the time");
3089 REGEX_ASSERT(fields[5]=="foo");
3090
3091 status = U_ZERO_ERROR;
3092 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3093 REGEX_CHECK_STATUS;
3094 REGEX_ASSERT(n==4);
3095 REGEX_ASSERT(fields[0]==" ");
3096 REGEX_ASSERT(fields[1]=="a");
3097 REGEX_ASSERT(fields[2]=="Now is ");
3098 REGEX_ASSERT(fields[3]=="the time<c>");
3099 status = U_ZERO_ERROR;
3100 delete pat1;
3101
3102 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3103 pat1 = RegexPattern::compile(&re1, pe, status);
3104 REGEX_CHECK_STATUS;
3105 n = pat1->split("1-10,20", fields, 10, status);
3106 REGEX_CHECK_STATUS;
3107 REGEX_ASSERT(n==5);
3108 REGEX_ASSERT(fields[0]=="1");
3109 REGEX_ASSERT(fields[1]=="-");
3110 REGEX_ASSERT(fields[2]=="10");
3111 REGEX_ASSERT(fields[3]==",");
3112 REGEX_ASSERT(fields[4]=="20");
3113 delete pat1;
3114
3115
3116 //
3117 // split of a UText based string, with library allocating output UTexts.
3118 //
3119 {
3120 status = U_ZERO_ERROR;
3121 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3122 UnicodeString stringToSplit("first:second:third");
3123 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3124 REGEX_CHECK_STATUS;
3125
3126 UText *splits[10] = {NULL};
3127 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3128 REGEX_CHECK_STATUS;
3129 REGEX_ASSERT(numFields == 5);
3130 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3131 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3132 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3134 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3135 REGEX_ASSERT(splits[5] == NULL);
3136
3137 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3138 if (splits[i]) {
3139 utext_close(splits[i]);
3140 splits[i] = NULL;
3141 }
3142 }
3143 utext_close(textToSplit);
3144 }
3145
3146
3147 //
3148 // RegexPattern::pattern() and patternText()
3149 //
3150 pat1 = new RegexPattern();
3151 REGEX_ASSERT(pat1->pattern() == "");
3152 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3153 delete pat1;
3154 const char *helloWorldInvariant = "(Hello, world)*";
3155 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3156 pat1 = RegexPattern::compile(&re1, pe, status);
3157 REGEX_CHECK_STATUS;
3158 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3159 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3160 delete pat1;
3161
3162 utext_close(&re1);
3163 }
3164
3165
3166 //---------------------------------------------------------------------------
3167 //
3168 // Extended A more thorough check for features of regex patterns
3169 // The test cases are in a separate data file,
3170 // source/tests/testdata/regextst.txt
3171 // A description of the test data format is included in that file.
3172 //
3173 //---------------------------------------------------------------------------
3174
3175 const char *
3176 RegexTest::getPath(char buffer[2048], const char *filename) {
3177 UErrorCode status=U_ZERO_ERROR;
3178 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3179 if (U_FAILURE(status)) {
3180 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3181 return NULL;
3182 }
3183
3184 strcpy(buffer, testDataDirectory);
3185 strcat(buffer, filename);
3186 return buffer;
3187 }
3188
3189 void RegexTest::Extended() {
3190 char tdd[2048];
3191 const char *srcPath;
3192 UErrorCode status = U_ZERO_ERROR;
3193 int32_t lineNum = 0;
3194
3195 //
3196 // Open and read the test data file.
3197 //
3198 srcPath=getPath(tdd, "regextst.txt");
3199 if(srcPath==NULL) {
3200 return; /* something went wrong, error already output */
3201 }
3202
3203 int32_t len;
3204 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3205 if (U_FAILURE(status)) {
3206 return; /* something went wrong, error already output */
3207 }
3208
3209 //
3210 // Put the test data into a UnicodeString
3211 //
3212 UnicodeString testString(FALSE, testData, len);
3213
3214 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3215 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3216 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3217
3218 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3219 UnicodeString testPattern; // The pattern for test from the test file.
3220 UnicodeString testFlags; // the flags for a test.
3221 UnicodeString matchString; // The marked up string to be used as input
3222
3223 if (U_FAILURE(status)){
3224 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3225 delete [] testData;
3226 return;
3227 }
3228
3229 //
3230 // Loop over the test data file, once per line.
3231 //
3232 while (lineMat.find()) {
3233 lineNum++;
3234 if (U_FAILURE(status)) {
3235 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3236 }
3237
3238 status = U_ZERO_ERROR;
3239 UnicodeString testLine = lineMat.group(1, status);
3240 if (testLine.length() == 0) {
3241 continue;
3242 }
3243
3244 //
3245 // Parse the test line. Skip blank and comment only lines.
3246 // Separate out the three main fields - pattern, flags, target.
3247 //
3248
3249 commentMat.reset(testLine);
3250 if (commentMat.lookingAt(status)) {
3251 // This line is a comment, or blank.
3252 continue;
3253 }
3254
3255 //
3256 // Pull out the pattern field, remove it from the test file line.
3257 //
3258 quotedStuffMat.reset(testLine);
3259 if (quotedStuffMat.lookingAt(status)) {
3260 testPattern = quotedStuffMat.group(2, status);
3261 testLine.remove(0, quotedStuffMat.end(0, status));
3262 } else {
3263 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3264 continue;
3265 }
3266
3267
3268 //
3269 // Pull out the flags from the test file line.
3270 //
3271 flagsMat.reset(testLine);
3272 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3273 testFlags = flagsMat.group(1, status);
3274 if (flagsMat.group(2, status).length() > 0) {
3275 errln("Bad Match flag at line %d. Scanning %c\n",
3276 lineNum, flagsMat.group(2, status).charAt(0));
3277 continue;
3278 }
3279 testLine.remove(0, flagsMat.end(0, status));
3280
3281 //
3282 // Pull out the match string, as a whole.
3283 // We'll process the <tags> later.
3284 //
3285 quotedStuffMat.reset(testLine);
3286 if (quotedStuffMat.lookingAt(status)) {
3287 matchString = quotedStuffMat.group(2, status);
3288 testLine.remove(0, quotedStuffMat.end(0, status));
3289 } else {
3290 errln("Bad match string at test file line %d", lineNum);
3291 continue;
3292 }
3293
3294 //
3295 // The only thing left from the input line should be an optional trailing comment.
3296 //
3297 commentMat.reset(testLine);
3298 if (commentMat.lookingAt(status) == FALSE) {
3299 errln("Line %d: unexpected characters at end of test line.", lineNum);
3300 continue;
3301 }
3302
3303 //
3304 // Run the test
3305 //
3306 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3307 }
3308
3309 delete [] testData;
3310
3311 }
3312
3313
3314
3315 //---------------------------------------------------------------------------
3316 //
3317 // regex_find(pattern, flags, inputString, lineNumber)
3318 //
3319 // Function to run a single test from the Extended (data driven) tests.
3320 // See file test/testdata/regextst.txt for a description of the
3321 // pattern and inputString fields, and the allowed flags.
3322 // lineNumber is the source line in regextst.txt of the test.
3323 //
3324 //---------------------------------------------------------------------------
3325
3326
3327 // Set a value into a UVector at position specified by a decimal number in
3328 // a UnicodeString. This is a utility function needed by the actual test function,
3329 // which follows.
3330 static void set(UVector &vec, int32_t val, UnicodeString index) {
3331 UErrorCode status=U_ZERO_ERROR;
3332 int32_t idx = 0;
3333 for (int32_t i=0; i<index.length(); i++) {
3334 int32_t d=u_charDigitValue(index.charAt(i));
3335 if (d<0) {return;}
3336 idx = idx*10 + d;
3337 }
3338 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3339 vec.setElementAt(val, idx);
3340 }
3341
3342 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3343 UErrorCode status=U_ZERO_ERROR;
3344 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3345 vec.setElementAt(val, idx);
3346 }
3347
3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3349 {
3350 UBool couldFind = TRUE;
3351 UTEXT_SETNATIVEINDEX(utext, 0);
3352 int32_t i = 0;
3353 while (i < unistrOffset) {
3354 UChar32 c = UTEXT_NEXT32(utext);
3355 if (c != U_SENTINEL) {
3356 i += U16_LENGTH(c);
3357 } else {
3358 couldFind = FALSE;
3359 break;
3360 }
3361 }
3362 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3363 return couldFind;
3364 }
3365
3366
3367 void RegexTest::regex_find(const UnicodeString &pattern,
3368 const UnicodeString &flags,
3369 const UnicodeString &inputString,
3370 const char *srcPath,
3371 int32_t line) {
3372 UnicodeString unEscapedInput;
3373 UnicodeString deTaggedInput;
3374
3375 int32_t patternUTF8Length, inputUTF8Length;
3376 char *patternChars = NULL, *inputChars = NULL;
3377 UText patternText = UTEXT_INITIALIZER;
3378 UText inputText = UTEXT_INITIALIZER;
3379 UConverter *UTF8Converter = NULL;
3380
3381 UErrorCode status = U_ZERO_ERROR;
3382 UParseError pe;
3383 RegexPattern *parsePat = NULL;
3384 RegexMatcher *parseMatcher = NULL;
3385 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3386 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3387 UVector groupStarts(status);
3388 UVector groupEnds(status);
3389 UVector groupStartsUTF8(status);
3390 UVector groupEndsUTF8(status);
3391 UBool isMatch = FALSE, isUTF8Match = FALSE;
3392 UBool failed = FALSE;
3393 int32_t numFinds;
3394 int32_t i;
3395 UBool useMatchesFunc = FALSE;
3396 UBool useLookingAtFunc = FALSE;
3397 int32_t regionStart = -1;
3398 int32_t regionEnd = -1;
3399 int32_t regionStartUTF8 = -1;
3400 int32_t regionEndUTF8 = -1;
3401
3402
3403 //
3404 // Compile the caller's pattern
3405 //
3406 uint32_t bflags = 0;
3407 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3408 bflags |= UREGEX_CASE_INSENSITIVE;
3409 }
3410 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3411 bflags |= UREGEX_COMMENTS;
3412 }
3413 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3414 bflags |= UREGEX_DOTALL;
3415 }
3416 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3417 bflags |= UREGEX_MULTILINE;
3418 }
3419
3420 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3421 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3422 }
3423 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3424 bflags |= UREGEX_UNIX_LINES;
3425 }
3426 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3427 bflags |= UREGEX_LITERAL;
3428 }
3429
3430
3431 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3432 if (status != U_ZERO_ERROR) {
3433 #if UCONFIG_NO_BREAK_ITERATION==1
3434 // 'v' test flag means that the test pattern should not compile if ICU was configured
3435 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3436 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3437 goto cleanupAndReturn;
3438 }
3439 #endif
3440 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3441 // Expected pattern compilation error.
3442 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3443 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3444 }
3445 goto cleanupAndReturn;
3446 } else {
3447 // Unexpected pattern compilation error.
3448 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3449 goto cleanupAndReturn;
3450 }
3451 }
3452
3453 UTF8Converter = ucnv_open("UTF8", &status);
3454 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3455
3456 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3457 status = U_ZERO_ERROR; // buffer overflow
3458 patternChars = new char[patternUTF8Length+1];
3459 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3460 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3461
3462 if (status == U_ZERO_ERROR) {
3463 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3464
3465 if (status != U_ZERO_ERROR) {
3466 #if UCONFIG_NO_BREAK_ITERATION==1
3467 // 'v' test flag means that the test pattern should not compile if ICU was configured
3468 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3469 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3470 goto cleanupAndReturn;
3471 }
3472 #endif
3473 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3474 // Expected pattern compilation error.
3475 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3476 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3477 }
3478 goto cleanupAndReturn;
3479 } else {
3480 // Unexpected pattern compilation error.
3481 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3482 goto cleanupAndReturn;
3483 }
3484 }
3485 }
3486
3487 if (UTF8Pattern == NULL) {
3488 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3489 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3490 status = U_ZERO_ERROR;
3491 }
3492
3493 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3494 callerPattern->dumpPattern();
3495 }
3496
3497 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3498 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3499 goto cleanupAndReturn;
3500 }
3501
3502
3503 //
3504 // Number of times find() should be called on the test string, default to 1
3505 //
3506 numFinds = 1;
3507 for (i=2; i<=9; i++) {
3508 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3509 if (numFinds != 1) {
3510 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3511 goto cleanupAndReturn;
3512 }
3513 numFinds = i;
3514 }
3515 }
3516
3517 // 'M' flag. Use matches() instead of find()
3518 if (flags.indexOf((UChar)0x4d) >= 0) {
3519 useMatchesFunc = TRUE;
3520 }
3521 if (flags.indexOf((UChar)0x4c) >= 0) {
3522 useLookingAtFunc = TRUE;
3523 }
3524
3525 //
3526 // Find the tags in the input data, remove them, and record the group boundary
3527 // positions.
3528 //
3529 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3530 REGEX_CHECK_STATUS_L(line);
3531
3532 unEscapedInput = inputString.unescape();
3533 parseMatcher = parsePat->matcher(unEscapedInput, status);
3534 REGEX_CHECK_STATUS_L(line);
3535 while(parseMatcher->find()) {
3536 parseMatcher->appendReplacement(deTaggedInput, "", status);
3537 REGEX_CHECK_STATUS;
3538 UnicodeString groupNum = parseMatcher->group(2, status);
3539 if (groupNum == "r") {
3540 // <r> or </r>, a region specification within the string
3541 if (parseMatcher->group(1, status) == "/") {
3542 regionEnd = deTaggedInput.length();
3543 } else {
3544 regionStart = deTaggedInput.length();
3545 }
3546 } else {
3547 // <digits> or </digits>, a group match boundary tag.
3548 if (parseMatcher->group(1, status) == "/") {
3549 set(groupEnds, deTaggedInput.length(), groupNum);
3550 } else {
3551 set(groupStarts, deTaggedInput.length(), groupNum);
3552 }
3553 }
3554 }
3555 parseMatcher->appendTail(deTaggedInput);
3556 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3557 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3558 errln("mismatched <r> tags");
3559 failed = TRUE;
3560 goto cleanupAndReturn;
3561 }
3562
3563 //
3564 // Configure the matcher according to the flags specified with this test.
3565 //
3566 matcher = callerPattern->matcher(deTaggedInput, status);
3567 REGEX_CHECK_STATUS_L(line);
3568 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3569 matcher->setTrace(TRUE);
3570 }
3571
3572 if (UTF8Pattern != NULL) {
3573 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3574 status = U_ZERO_ERROR; // buffer overflow
3575 inputChars = new char[inputUTF8Length+1];
3576 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3577 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3578
3579 if (status == U_ZERO_ERROR) {
3580 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3581 REGEX_CHECK_STATUS_L(line);
3582 }
3583
3584 if (UTF8Matcher == NULL) {
3585 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3586 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3587 status = U_ZERO_ERROR;
3588 }
3589 }
3590
3591 //
3592 // Generate native indices for UTF8 versions of region and capture group info
3593 //
3594 if (UTF8Matcher != NULL) {
3595 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3596 UTF8Matcher->setTrace(TRUE);
3597 }
3598 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3599 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3600
3601 // Fill out the native index UVector info.
3602 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3603 for (i=0; i<groupStarts.size(); i++) {
3604 int32_t start = groupStarts.elementAti(i);
3605 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3606 if (start >= 0) {
3607 int32_t startUTF8;
3608 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3609 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3610 failed = TRUE;
3611 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3612 }
3613 setInt(groupStartsUTF8, startUTF8, i);
3614 }
3615
3616 int32_t end = groupEnds.elementAti(i);
3617 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3618 if (end >= 0) {
3619 int32_t endUTF8;
3620 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3621 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3622 failed = TRUE;
3623 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3624 }
3625 setInt(groupEndsUTF8, endUTF8, i);
3626 }
3627 }
3628 }
3629
3630 if (regionStart>=0) {
3631 matcher->region(regionStart, regionEnd, status);
3632 REGEX_CHECK_STATUS_L(line);
3633 if (UTF8Matcher != NULL) {
3634 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3635 REGEX_CHECK_STATUS_L(line);
3636 }
3637 }
3638 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3639 matcher->useAnchoringBounds(FALSE);
3640 if (UTF8Matcher != NULL) {
3641 UTF8Matcher->useAnchoringBounds(FALSE);
3642 }
3643 }
3644 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3645 matcher->useTransparentBounds(TRUE);
3646 if (UTF8Matcher != NULL) {
3647 UTF8Matcher->useTransparentBounds(TRUE);
3648 }
3649 }
3650
3651
3652
3653 //
3654 // Do a find on the de-tagged input using the caller's pattern
3655 // TODO: error on count>1 and not find().
3656 // error on both matches() and lookingAt().
3657 //
3658 for (i=0; i<numFinds; i++) {
3659 if (useMatchesFunc) {
3660 isMatch = matcher->matches(status);
3661 if (UTF8Matcher != NULL) {
3662 isUTF8Match = UTF8Matcher->matches(status);
3663 }
3664 } else if (useLookingAtFunc) {
3665 isMatch = matcher->lookingAt(status);
3666 if (UTF8Matcher != NULL) {
3667 isUTF8Match = UTF8Matcher->lookingAt(status);
3668 }
3669 } else {
3670 isMatch = matcher->find();
3671 if (UTF8Matcher != NULL) {
3672 isUTF8Match = UTF8Matcher->find();
3673 }
3674 }
3675 }
3676 matcher->setTrace(FALSE);
3677 if (UTF8Matcher) {
3678 UTF8Matcher->setTrace(FALSE);
3679 }
3680 if (U_FAILURE(status)) {
3681 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3682 }
3683
3684 //
3685 // Match up the groups from the find() with the groups from the tags
3686 //
3687
3688 // number of tags should match number of groups from find operation.
3689 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3690 // G option in test means that capture group data is not available in the
3691 // expected results, so the check needs to be suppressed.
3692 if (isMatch == FALSE && groupStarts.size() != 0) {
3693 dataerrln("Error at line %d: Match expected, but none found.", line);
3694 failed = TRUE;
3695 goto cleanupAndReturn;
3696 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3697 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3698 failed = TRUE;
3699 goto cleanupAndReturn;
3700 }
3701 if (isMatch && groupStarts.size() == 0) {
3702 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3703 failed = TRUE;
3704 }
3705 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3706 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3707 failed = TRUE;
3708 }
3709
3710 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3711 // Only check for match / no match. Don't check capture groups.
3712 goto cleanupAndReturn;
3713 }
3714
3715 REGEX_CHECK_STATUS_L(line);
3716 for (i=0; i<=matcher->groupCount(); i++) {
3717 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3718 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3719 if (matcher->start(i, status) != expectedStart) {
3720 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3721 line, i, expectedStart, matcher->start(i, status));
3722 failed = TRUE;
3723 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3724 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3725 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3726 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3727 failed = TRUE;
3728 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3729 }
3730
3731 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3732 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3733 if (matcher->end(i, status) != expectedEnd) {
3734 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3735 line, i, expectedEnd, matcher->end(i, status));
3736 failed = TRUE;
3737 // Error on end position; keep going; real error is probably yet to come as group
3738 // end positions work from end of the input data towards the front.
3739 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3740 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3741 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3742 failed = TRUE;
3743 // Error on end position; keep going; real error is probably yet to come as group
3744 // end positions work from end of the input data towards the front.
3745 }
3746 }
3747 if ( matcher->groupCount()+1 < groupStarts.size()) {
3748 errln("Error at line %d: Expected %d capture groups, found %d.",
3749 line, groupStarts.size()-1, matcher->groupCount());
3750 failed = TRUE;
3751 }
3752 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3753 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3754 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3755 failed = TRUE;
3756 }
3757
3758 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3759 matcher->requireEnd() == TRUE) {
3760 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3761 failed = TRUE;
3762 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3763 UTF8Matcher->requireEnd() == TRUE) {
3764 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3765 failed = TRUE;
3766 }
3767
3768 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3769 matcher->requireEnd() == FALSE) {
3770 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3771 failed = TRUE;
3772 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3773 UTF8Matcher->requireEnd() == FALSE) {
3774 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3775 failed = TRUE;
3776 }
3777
3778 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3779 matcher->hitEnd() == TRUE) {
3780 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3781 failed = TRUE;
3782 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3783 UTF8Matcher->hitEnd() == TRUE) {
3784 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3785 failed = TRUE;
3786 }
3787
3788 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3789 matcher->hitEnd() == FALSE) {
3790 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3791 failed = TRUE;
3792 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3793 UTF8Matcher->hitEnd() == FALSE) {
3794 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3795 failed = TRUE;
3796 }
3797
3798
3799 cleanupAndReturn:
3800 if (failed) {
3801 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3802 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3803 // callerPattern->dump();
3804 }
3805 delete parseMatcher;
3806 delete parsePat;
3807 delete UTF8Matcher;
3808 delete UTF8Pattern;
3809 delete matcher;
3810 delete callerPattern;
3811
3812 utext_close(&inputText);
3813 delete[] inputChars;
3814 utext_close(&patternText);
3815 delete[] patternChars;
3816 ucnv_close(UTF8Converter);
3817 }
3818
3819
3820
3821
3822 //---------------------------------------------------------------------------
3823 //
3824 // Errors Check for error handling in patterns.
3825 //
3826 //---------------------------------------------------------------------------
3827 void RegexTest::Errors() {
3828 // \escape sequences that aren't implemented yet.
3829 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3830
3831 // Missing close parentheses
3832 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3833 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3834 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3835
3836 // Extra close paren
3837 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3838 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3839 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3840
3841 // Look-ahead, Look-behind
3842 // TODO: add tests for unbounded length look-behinds.
3843 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3844
3845 // Attempt to use non-default flags
3846 {
3847 UParseError pe;
3848 UErrorCode status = U_ZERO_ERROR;
3849 int32_t flags = UREGEX_CANON_EQ |
3850 UREGEX_COMMENTS | UREGEX_DOTALL |
3851 UREGEX_MULTILINE;
3852 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3853 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3854 delete pat1;
3855 }
3856
3857
3858 // Quantifiers are allowed only after something that can be quantified.
3859 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3860 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3861 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3862
3863 // Mal-formed {min,max} quantifiers
3864 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3865 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3866 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3867 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3868 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3869 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3870 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3871 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3872 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3873
3874 // Ticket 5389
3875 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3876
3877 // Invalid Back Reference \0
3878 // For ICU 3.8 and earlier
3879 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3880 //
3881 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3882
3883 }
3884
3885
3886 //-------------------------------------------------------------------------------
3887 //
3888 // Read a text data file, convert it to UChars, and return the data
3889 // in one big UChar * buffer, which the caller must delete.
3890 //
3891 //--------------------------------------------------------------------------------
3892 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3893 const char *defEncoding, UErrorCode &status) {
3894 UChar *retPtr = NULL;
3895 char *fileBuf = NULL;
3896 UConverter* conv = NULL;
3897 FILE *f = NULL;
3898
3899 ulen = 0;
3900 if (U_FAILURE(status)) {
3901 return retPtr;
3902 }
3903
3904 //
3905 // Open the file.
3906 //
3907 f = fopen(fileName, "rb");
3908 if (f == 0) {
3909 dataerrln("Error opening test data file %s\n", fileName);
3910 status = U_FILE_ACCESS_ERROR;
3911 return NULL;
3912 }
3913 //
3914 // Read it in
3915 //
3916 int32_t fileSize;
3917 int32_t amt_read;
3918
3919 fseek( f, 0, SEEK_END);
3920 fileSize = ftell(f);
3921 fileBuf = new char[fileSize];
3922 fseek(f, 0, SEEK_SET);
3923 amt_read = fread(fileBuf, 1, fileSize, f);
3924 if (amt_read != fileSize || fileSize <= 0) {
3925 errln("Error reading test data file.");
3926 goto cleanUpAndReturn;
3927 }
3928
3929 //
3930 // Look for a Unicode Signature (BOM) on the data just read
3931 //
3932 int32_t signatureLength;
3933 const char * fileBufC;
3934 const char* encoding;
3935
3936 fileBufC = fileBuf;
3937 encoding = ucnv_detectUnicodeSignature(
3938 fileBuf, fileSize, &signatureLength, &status);
3939 if(encoding!=NULL ){
3940 fileBufC += signatureLength;
3941 fileSize -= signatureLength;
3942 } else {
3943 encoding = defEncoding;
3944 if (strcmp(encoding, "utf-8") == 0) {
3945 errln("file %s is missing its BOM", fileName);
3946 }
3947 }
3948
3949 //
3950 // Open a converter to take the rule file to UTF-16
3951 //
3952 conv = ucnv_open(encoding, &status);
3953 if (U_FAILURE(status)) {
3954 goto cleanUpAndReturn;
3955 }
3956
3957 //
3958 // Convert the rules to UChar.
3959 // Preflight first to determine required buffer size.
3960 //
3961 ulen = ucnv_toUChars(conv,
3962 NULL, // dest,
3963 0, // destCapacity,
3964 fileBufC,
3965 fileSize,
3966 &status);
3967 if (status == U_BUFFER_OVERFLOW_ERROR) {
3968 // Buffer Overflow is expected from the preflight operation.
3969 status = U_ZERO_ERROR;
3970
3971 retPtr = new UChar[ulen+1];
3972 ucnv_toUChars(conv,
3973 retPtr, // dest,
3974 ulen+1,
3975 fileBufC,
3976 fileSize,
3977 &status);
3978 }
3979
3980 cleanUpAndReturn:
3981 fclose(f);
3982 delete[] fileBuf;
3983 ucnv_close(conv);
3984 if (U_FAILURE(status)) {
3985 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3986 delete []retPtr;
3987 retPtr = 0;
3988 ulen = 0;
3989 };
3990 return retPtr;
3991 }
3992
3993
3994 //-------------------------------------------------------------------------------
3995 //
3996 // PerlTests - Run Perl's regular expression tests
3997 // The input file for this test is re_tests, the standard regular
3998 // expression test data distributed with the Perl source code.
3999 //
4000 // Here is Perl's description of the test data file:
4001 //
4002 // # The tests are in a separate file 't/op/re_tests'.
4003 // # Each line in that file is a separate test.
4004 // # There are five columns, separated by tabs.
4005 // #
4006 // # Column 1 contains the pattern, optionally enclosed in C<''>.
4007 // # Modifiers can be put after the closing C<'>.
4008 // #
4009 // # Column 2 contains the string to be matched.
4010 // #
4011 // # Column 3 contains the expected result:
4012 // # y expect a match
4013 // # n expect no match
4014 // # c expect an error
4015 // # B test exposes a known bug in Perl, should be skipped
4016 // # b test exposes a known bug in Perl, should be skipped if noamp
4017 // #
4018 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4019 // #
4020 // # Column 4 contains a string, usually C<$&>.
4021 // #
4022 // # Column 5 contains the expected result of double-quote
4023 // # interpolating that string after the match, or start of error message.
4024 // #
4025 // # Column 6, if present, contains a reason why the test is skipped.
4026 // # This is printed with "skipped", for harness to pick up.
4027 // #
4028 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4029 // #
4030 // # If you want to add a regular expression test that can't be expressed
4031 // # in this format, don't add it here: put it in op/pat.t instead.
4032 //
4033 // For ICU, if field 3 contains an 'i', the test will be skipped.
4034 // The test exposes some known incompatibility between ICU and Perl regexps.
4035 // (The i is in addition to whatever was there before.)
4036 //
4037 //-------------------------------------------------------------------------------
4038 void RegexTest::PerlTests() {
4039 char tdd[2048];
4040 const char *srcPath;
4041 UErrorCode status = U_ZERO_ERROR;
4042 UParseError pe;
4043
4044 //
4045 // Open and read the test data file.
4046 //
4047 srcPath=getPath(tdd, "re_tests.txt");
4048 if(srcPath==NULL) {
4049 return; /* something went wrong, error already output */
4050 }
4051
4052 int32_t len;
4053 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4054 if (U_FAILURE(status)) {
4055 return; /* something went wrong, error already output */
4056 }
4057
4058 //
4059 // Put the test data into a UnicodeString
4060 //
4061 UnicodeString testDataString(FALSE, testData, len);
4062
4063 //
4064 // Regex to break the input file into lines, and strip the new lines.
4065 // One line per match, capture group one is the desired data.
4066 //
4067 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4068 if (U_FAILURE(status)) {
4069 dataerrln("RegexPattern::compile() error");
4070 return;
4071 }
4072 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4073
4074 //
4075 // Regex to split a test file line into fields.
4076 // There are six fields, separated by tabs.
4077 //
4078 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4079
4080 //
4081 // Regex to identify test patterns with flag settings, and to separate them.
4082 // Test patterns with flags look like 'pattern'i
4083 // Test patterns without flags are not quoted: pattern
4084 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4085 //
4086 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4087 RegexMatcher* flagMat = flagPat->matcher(status);
4088
4089 //
4090 // The Perl tests reference several perl-isms, which are evaluated/substituted
4091 // in the test data. Not being perl, this must be done explicitly. Here
4092 // are string constants and REs for these constructs.
4093 //
4094 UnicodeString nulnulSrc("${nulnul}");
4095 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4096 nulnul = nulnul.unescape();
4097
4098 UnicodeString ffffSrc("${ffff}");
4099 UnicodeString ffff("\\uffff", -1, US_INV);
4100 ffff = ffff.unescape();
4101
4102 // regexp for $-[0], $+[2], etc.
4103 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4104 RegexMatcher *groupsMat = groupsPat->matcher(status);
4105
4106 // regexp for $0, $1, $2, etc.
4107 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4108 RegexMatcher *cgMat = cgPat->matcher(status);
4109
4110
4111 //
4112 // Main Loop for the Perl Tests, runs once per line from the
4113 // test data file.
4114 //
4115 int32_t lineNum = 0;
4116 int32_t skippedUnimplementedCount = 0;
4117 while (lineMat->find()) {
4118 lineNum++;
4119
4120 //
4121 // Get a line, break it into its fields, do the Perl
4122 // variable substitutions.
4123 //
4124 UnicodeString line = lineMat->group(1, status);
4125 UnicodeString fields[7];
4126 fieldPat->split(line, fields, 7, status);
4127
4128 flagMat->reset(fields[0]);
4129 flagMat->matches(status);
4130 UnicodeString pattern = flagMat->group(2, status);
4131 pattern.findAndReplace("${bang}", "!");
4132 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4133 pattern.findAndReplace(ffffSrc, ffff);
4134
4135 //
4136 // Identify patterns that include match flag settings,
4137 // split off the flags, remove the extra quotes.
4138 //
4139 UnicodeString flagStr = flagMat->group(3, status);
4140 if (U_FAILURE(status)) {
4141 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4142 return;
4143 }
4144 int32_t flags = 0;
4145 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4146 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4147 const UChar UChar_m = 0x6d;
4148 const UChar UChar_x = 0x78;
4149 const UChar UChar_y = 0x79;
4150 if (flagStr.indexOf(UChar_i) != -1) {
4151 flags |= UREGEX_CASE_INSENSITIVE;
4152 }
4153 if (flagStr.indexOf(UChar_m) != -1) {
4154 flags |= UREGEX_MULTILINE;
4155 }
4156 if (flagStr.indexOf(UChar_x) != -1) {
4157 flags |= UREGEX_COMMENTS;
4158 }
4159
4160 //
4161 // Compile the test pattern.
4162 //
4163 status = U_ZERO_ERROR;
4164 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4165 if (status == U_REGEX_UNIMPLEMENTED) {
4166 //
4167 // Test of a feature that is planned for ICU, but not yet implemented.
4168 // skip the test.
4169 skippedUnimplementedCount++;
4170 delete testPat;
4171 status = U_ZERO_ERROR;
4172 continue;
4173 }
4174
4175 if (U_FAILURE(status)) {
4176 // Some tests are supposed to generate errors.
4177 // Only report an error for tests that are supposed to succeed.
4178 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4179 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4180 {
4181 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4182 }
4183 status = U_ZERO_ERROR;
4184 delete testPat;
4185 continue;
4186 }
4187
4188 if (fields[2].indexOf(UChar_i) >= 0) {
4189 // ICU should skip this test.
4190 delete testPat;
4191 continue;
4192 }
4193
4194 if (fields[2].indexOf(UChar_c) >= 0) {
4195 // This pattern should have caused a compilation error, but didn't/
4196 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4197 delete testPat;
4198 continue;
4199 }
4200
4201 //
4202 // replace the Perl variables that appear in some of the
4203 // match data strings.
4204 //
4205 UnicodeString matchString = fields[1];
4206 matchString.findAndReplace(nulnulSrc, nulnul);
4207 matchString.findAndReplace(ffffSrc, ffff);
4208
4209 // Replace any \n in the match string with an actual new-line char.
4210 // Don't do full unescape, as this unescapes more than Perl does, which
4211 // causes other spurious failures in the tests.
4212 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4213
4214
4215
4216 //
4217 // Run the test, check for expected match/don't match result.
4218 //
4219 RegexMatcher *testMat = testPat->matcher(matchString, status);
4220 UBool found = testMat->find();
4221 UBool expected = FALSE;
4222 if (fields[2].indexOf(UChar_y) >=0) {
4223 expected = TRUE;
4224 }
4225 if (expected != found) {
4226 errln("line %d: Expected %smatch, got %smatch",
4227 lineNum, expected?"":"no ", found?"":"no " );
4228 continue;
4229 }
4230
4231 // Don't try to check expected results if there is no match.
4232 // (Some have stuff in the expected fields)
4233 if (!found) {
4234 delete testMat;
4235 delete testPat;
4236 continue;
4237 }
4238
4239 //
4240 // Interpret the Perl expression from the fourth field of the data file,
4241 // building up an ICU string from the results of the ICU match.
4242 // The Perl expression will contain references to the results of
4243 // a regex match, including the matched string, capture group strings,
4244 // group starting and ending indicies, etc.
4245 //
4246 UnicodeString resultString;
4247 UnicodeString perlExpr = fields[3];
4248 #if SUPPORT_MUTATING_INPUT_STRING
4249 groupsMat->reset(perlExpr);
4250 cgMat->reset(perlExpr);
4251 #endif
4252
4253 while (perlExpr.length() > 0) {
4254 #if !SUPPORT_MUTATING_INPUT_STRING
4255 // Perferred usage. Reset after any modification to input string.
4256 groupsMat->reset(perlExpr);
4257 cgMat->reset(perlExpr);
4258 #endif
4259
4260 if (perlExpr.startsWith("$&")) {
4261 resultString.append(testMat->group(status));
4262 perlExpr.remove(0, 2);
4263 }
4264
4265 else if (groupsMat->lookingAt(status)) {
4266 // $-[0] $+[2] etc.
4267 UnicodeString digitString = groupsMat->group(2, status);
4268 int32_t t = 0;
4269 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4270 UnicodeString plusOrMinus = groupsMat->group(1, status);
4271 int32_t matchPosition;
4272 if (plusOrMinus.compare("+") == 0) {
4273 matchPosition = testMat->end(groupNum, status);
4274 } else {
4275 matchPosition = testMat->start(groupNum, status);
4276 }
4277 if (matchPosition != -1) {
4278 ICU_Utility::appendNumber(resultString, matchPosition);
4279 }
4280 perlExpr.remove(0, groupsMat->end(status));
4281 }
4282
4283 else if (cgMat->lookingAt(status)) {
4284 // $1, $2, $3, etc.
4285 UnicodeString digitString = cgMat->group(1, status);
4286 int32_t t = 0;
4287 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4288 if (U_SUCCESS(status)) {
4289 resultString.append(testMat->group(groupNum, status));
4290 status = U_ZERO_ERROR;
4291 }
4292 perlExpr.remove(0, cgMat->end(status));
4293 }
4294
4295 else if (perlExpr.startsWith("@-")) {
4296 int32_t i;
4297 for (i=0; i<=testMat->groupCount(); i++) {
4298 if (i>0) {
4299 resultString.append(" ");
4300 }
4301 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4302 }
4303 perlExpr.remove(0, 2);
4304 }
4305
4306 else if (perlExpr.startsWith("@+")) {
4307 int32_t i;
4308 for (i=0; i<=testMat->groupCount(); i++) {
4309 if (i>0) {
4310 resultString.append(" ");
4311 }
4312 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4313 }
4314 perlExpr.remove(0, 2);
4315 }
4316
4317 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4318 // or as an escaped sequence (e.g. \n)
4319 if (perlExpr.length() > 1) {
4320 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4321 }
4322 UChar c = perlExpr.charAt(0);
4323 switch (c) {
4324 case 'n': c = '\n'; break;
4325 // add any other escape sequences that show up in the test expected results.
4326 }
4327 resultString.append(c);
4328 perlExpr.remove(0, 1);
4329 }
4330
4331 else {
4332 // Any characters from the perl expression that we don't explicitly
4333 // recognize before here are assumed to be literals and copied
4334 // as-is to the expected results.
4335 resultString.append(perlExpr.charAt(0));
4336 perlExpr.remove(0, 1);
4337 }
4338
4339 if (U_FAILURE(status)) {
4340 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4341 break;
4342 }
4343 }
4344
4345 //
4346 // Expected Results Compare
4347 //
4348 UnicodeString expectedS(fields[4]);
4349 expectedS.findAndReplace(nulnulSrc, nulnul);
4350 expectedS.findAndReplace(ffffSrc, ffff);
4351 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4352
4353
4354 if (expectedS.compare(resultString) != 0) {
4355 err("Line %d: Incorrect perl expression results.", lineNum);
4356 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4357 }
4358
4359 delete testMat;
4360 delete testPat;
4361 }
4362
4363 //
4364 // All done. Clean up allocated stuff.
4365 //
4366 delete cgMat;
4367 delete cgPat;
4368
4369 delete groupsMat;
4370 delete groupsPat;
4371
4372 delete flagMat;
4373 delete flagPat;
4374
4375 delete lineMat;
4376 delete linePat;
4377
4378 delete fieldPat;
4379 delete [] testData;
4380
4381
4382 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4383
4384 }
4385
4386
4387 //-------------------------------------------------------------------------------
4388 //
4389 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4390 // (instead of using UnicodeStrings) to test the alternate engine.
4391 // The input file for this test is re_tests, the standard regular
4392 // expression test data distributed with the Perl source code.
4393 // See PerlTests() for more information.
4394 //
4395 //-------------------------------------------------------------------------------
4396 void RegexTest::PerlTestsUTF8() {
4397 char tdd[2048];
4398 const char *srcPath;
4399 UErrorCode status = U_ZERO_ERROR;
4400 UParseError pe;
4401 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4402 UText patternText = UTEXT_INITIALIZER;
4403 char *patternChars = NULL;
4404 int32_t patternLength;
4405 int32_t patternCapacity = 0;
4406 UText inputText = UTEXT_INITIALIZER;
4407 char *inputChars = NULL;
4408 int32_t inputLength;
4409 int32_t inputCapacity = 0;
4410
4411 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4412
4413 //
4414 // Open and read the test data file.
4415 //
4416 srcPath=getPath(tdd, "re_tests.txt");
4417 if(srcPath==NULL) {
4418 return; /* something went wrong, error already output */
4419 }
4420
4421 int32_t len;
4422 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4423 if (U_FAILURE(status)) {
4424 return; /* something went wrong, error already output */
4425 }
4426
4427 //
4428 // Put the test data into a UnicodeString
4429 //
4430 UnicodeString testDataString(FALSE, testData, len);
4431
4432 //
4433 // Regex to break the input file into lines, and strip the new lines.
4434 // One line per match, capture group one is the desired data.
4435 //
4436 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4437 if (U_FAILURE(status)) {
4438 dataerrln("RegexPattern::compile() error");
4439 return;
4440 }
4441 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4442
4443 //
4444 // Regex to split a test file line into fields.
4445 // There are six fields, separated by tabs.
4446 //
4447 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4448
4449 //
4450 // Regex to identify test patterns with flag settings, and to separate them.
4451 // Test patterns with flags look like 'pattern'i
4452 // Test patterns without flags are not quoted: pattern
4453 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4454 //
4455 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4456 RegexMatcher* flagMat = flagPat->matcher(status);
4457
4458 //
4459 // The Perl tests reference several perl-isms, which are evaluated/substituted
4460 // in the test data. Not being perl, this must be done explicitly. Here
4461 // are string constants and REs for these constructs.
4462 //
4463 UnicodeString nulnulSrc("${nulnul}");
4464 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4465 nulnul = nulnul.unescape();
4466
4467 UnicodeString ffffSrc("${ffff}");
4468 UnicodeString ffff("\\uffff", -1, US_INV);
4469 ffff = ffff.unescape();
4470
4471 // regexp for $-[0], $+[2], etc.
4472 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4473 RegexMatcher *groupsMat = groupsPat->matcher(status);
4474
4475 // regexp for $0, $1, $2, etc.
4476 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4477 RegexMatcher *cgMat = cgPat->matcher(status);
4478
4479
4480 //
4481 // Main Loop for the Perl Tests, runs once per line from the
4482 // test data file.
4483 //
4484 int32_t lineNum = 0;
4485 int32_t skippedUnimplementedCount = 0;
4486 while (lineMat->find()) {
4487 lineNum++;
4488
4489 //
4490 // Get a line, break it into its fields, do the Perl
4491 // variable substitutions.
4492 //
4493 UnicodeString line = lineMat->group(1, status);
4494 UnicodeString fields[7];
4495 fieldPat->split(line, fields, 7, status);
4496
4497 flagMat->reset(fields[0]);
4498 flagMat->matches(status);
4499 UnicodeString pattern = flagMat->group(2, status);
4500 pattern.findAndReplace("${bang}", "!");
4501 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4502 pattern.findAndReplace(ffffSrc, ffff);
4503
4504 //
4505 // Identify patterns that include match flag settings,
4506 // split off the flags, remove the extra quotes.
4507 //
4508 UnicodeString flagStr = flagMat->group(3, status);
4509 if (U_FAILURE(status)) {
4510 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4511 return;
4512 }
4513 int32_t flags = 0;
4514 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4515 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4516 const UChar UChar_m = 0x6d;
4517 const UChar UChar_x = 0x78;
4518 const UChar UChar_y = 0x79;
4519 if (flagStr.indexOf(UChar_i) != -1) {
4520 flags |= UREGEX_CASE_INSENSITIVE;
4521 }
4522 if (flagStr.indexOf(UChar_m) != -1) {
4523 flags |= UREGEX_MULTILINE;
4524 }
4525 if (flagStr.indexOf(UChar_x) != -1) {
4526 flags |= UREGEX_COMMENTS;
4527 }
4528
4529 //
4530 // Put the pattern in a UTF-8 UText
4531 //
4532 status = U_ZERO_ERROR;
4533 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4534 if (status == U_BUFFER_OVERFLOW_ERROR) {
4535 status = U_ZERO_ERROR;
4536 delete[] patternChars;
4537 patternCapacity = patternLength + 1;
4538 patternChars = new char[patternCapacity];
4539 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4540 }
4541 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4542
4543 //
4544 // Compile the test pattern.
4545 //
4546 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4547 if (status == U_REGEX_UNIMPLEMENTED) {
4548 //
4549 // Test of a feature that is planned for ICU, but not yet implemented.
4550 // skip the test.
4551 skippedUnimplementedCount++;
4552 delete testPat;
4553 status = U_ZERO_ERROR;
4554 continue;
4555 }
4556
4557 if (U_FAILURE(status)) {
4558 // Some tests are supposed to generate errors.
4559 // Only report an error for tests that are supposed to succeed.
4560 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4561 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4562 {
4563 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4564 }
4565 status = U_ZERO_ERROR;
4566 delete testPat;
4567 continue;
4568 }
4569
4570 if (fields[2].indexOf(UChar_i) >= 0) {
4571 // ICU should skip this test.
4572 delete testPat;
4573 continue;
4574 }
4575
4576 if (fields[2].indexOf(UChar_c) >= 0) {
4577 // This pattern should have caused a compilation error, but didn't/
4578 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4579 delete testPat;
4580 continue;
4581 }
4582
4583
4584 //
4585 // replace the Perl variables that appear in some of the
4586 // match data strings.
4587 //
4588 UnicodeString matchString = fields[1];
4589 matchString.findAndReplace(nulnulSrc, nulnul);
4590 matchString.findAndReplace(ffffSrc, ffff);
4591
4592 // Replace any \n in the match string with an actual new-line char.
4593 // Don't do full unescape, as this unescapes more than Perl does, which
4594 // causes other spurious failures in the tests.
4595 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4596
4597 //
4598 // Put the input in a UTF-8 UText
4599 //
4600 status = U_ZERO_ERROR;
4601 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4602 if (status == U_BUFFER_OVERFLOW_ERROR) {
4603 status = U_ZERO_ERROR;
4604 delete[] inputChars;
4605 inputCapacity = inputLength + 1;
4606 inputChars = new char[inputCapacity];
4607 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4608 }
4609 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4610
4611 //
4612 // Run the test, check for expected match/don't match result.
4613 //
4614 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4615 UBool found = testMat->find();
4616 UBool expected = FALSE;
4617 if (fields[2].indexOf(UChar_y) >=0) {
4618 expected = TRUE;
4619 }
4620 if (expected != found) {
4621 errln("line %d: Expected %smatch, got %smatch",
4622 lineNum, expected?"":"no ", found?"":"no " );
4623 continue;
4624 }
4625
4626 // Don't try to check expected results if there is no match.
4627 // (Some have stuff in the expected fields)
4628 if (!found) {
4629 delete testMat;
4630 delete testPat;
4631 continue;
4632 }
4633
4634 //
4635 // Interpret the Perl expression from the fourth field of the data file,
4636 // building up an ICU string from the results of the ICU match.
4637 // The Perl expression will contain references to the results of
4638 // a regex match, including the matched string, capture group strings,
4639 // group starting and ending indicies, etc.
4640 //
4641 UnicodeString resultString;
4642 UnicodeString perlExpr = fields[3];
4643
4644 while (perlExpr.length() > 0) {
4645 groupsMat->reset(perlExpr);
4646 cgMat->reset(perlExpr);
4647
4648 if (perlExpr.startsWith("$&")) {
4649 resultString.append(testMat->group(status));
4650 perlExpr.remove(0, 2);
4651 }
4652
4653 else if (groupsMat->lookingAt(status)) {
4654 // $-[0] $+[2] etc.
4655 UnicodeString digitString = groupsMat->group(2, status);
4656 int32_t t = 0;
4657 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4658 UnicodeString plusOrMinus = groupsMat->group(1, status);
4659 int32_t matchPosition;
4660 if (plusOrMinus.compare("+") == 0) {
4661 matchPosition = testMat->end(groupNum, status);
4662 } else {
4663 matchPosition = testMat->start(groupNum, status);
4664 }
4665 if (matchPosition != -1) {
4666 ICU_Utility::appendNumber(resultString, matchPosition);
4667 }
4668 perlExpr.remove(0, groupsMat->end(status));
4669 }
4670
4671 else if (cgMat->lookingAt(status)) {
4672 // $1, $2, $3, etc.
4673 UnicodeString digitString = cgMat->group(1, status);
4674 int32_t t = 0;
4675 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4676 if (U_SUCCESS(status)) {
4677 resultString.append(testMat->group(groupNum, status));
4678 status = U_ZERO_ERROR;
4679 }
4680 perlExpr.remove(0, cgMat->end(status));
4681 }
4682
4683 else if (perlExpr.startsWith("@-")) {
4684 int32_t i;
4685 for (i=0; i<=testMat->groupCount(); i++) {
4686 if (i>0) {
4687 resultString.append(" ");
4688 }
4689 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4690 }
4691 perlExpr.remove(0, 2);
4692 }
4693
4694 else if (perlExpr.startsWith("@+")) {
4695 int32_t i;
4696 for (i=0; i<=testMat->groupCount(); i++) {
4697 if (i>0) {
4698 resultString.append(" ");
4699 }
4700 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4701 }
4702 perlExpr.remove(0, 2);
4703 }
4704
4705 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4706 // or as an escaped sequence (e.g. \n)
4707 if (perlExpr.length() > 1) {
4708 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4709 }
4710 UChar c = perlExpr.charAt(0);
4711 switch (c) {
4712 case 'n': c = '\n'; break;
4713 // add any other escape sequences that show up in the test expected results.
4714 }
4715 resultString.append(c);
4716 perlExpr.remove(0, 1);
4717 }
4718
4719 else {
4720 // Any characters from the perl expression that we don't explicitly
4721 // recognize before here are assumed to be literals and copied
4722 // as-is to the expected results.
4723 resultString.append(perlExpr.charAt(0));
4724 perlExpr.remove(0, 1);
4725 }
4726
4727 if (U_FAILURE(status)) {
4728 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4729 break;
4730 }
4731 }
4732
4733 //
4734 // Expected Results Compare
4735 //
4736 UnicodeString expectedS(fields[4]);
4737 expectedS.findAndReplace(nulnulSrc, nulnul);
4738 expectedS.findAndReplace(ffffSrc, ffff);
4739 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4740
4741
4742 if (expectedS.compare(resultString) != 0) {
4743 err("Line %d: Incorrect perl expression results.", lineNum);
4744 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4745 }
4746
4747 delete testMat;
4748 delete testPat;
4749 }
4750
4751 //
4752 // All done. Clean up allocated stuff.
4753 //
4754 delete cgMat;
4755 delete cgPat;
4756
4757 delete groupsMat;
4758 delete groupsPat;
4759
4760 delete flagMat;
4761 delete flagPat;
4762
4763 delete lineMat;
4764 delete linePat;
4765
4766 delete fieldPat;
4767 delete [] testData;
4768
4769 utext_close(&patternText);
4770 utext_close(&inputText);
4771
4772 delete [] patternChars;
4773 delete [] inputChars;
4774
4775
4776 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4777
4778 }
4779
4780
4781 //--------------------------------------------------------------
4782 //
4783 // Bug6149 Verify limits to heap expansion for backtrack stack.
4784 // Use this pattern,
4785 // "(a?){1,8000000}"
4786 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4787 // This test is likely to be fragile, as further optimizations stop
4788 // more cases of pointless looping in the match engine.
4789 //
4790 //---------------------------------------------------------------
4791 void RegexTest::Bug6149() {
4792 UnicodeString pattern("(a?){1,8000000}");
4793 UnicodeString s("xyz");
4794 uint32_t flags = 0;
4795 UErrorCode status = U_ZERO_ERROR;
4796
4797 RegexMatcher matcher(pattern, s, flags, status);
4798 UBool result = false;
4799 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4800 REGEX_ASSERT(result == FALSE);
4801 }
4802
4803
4804 //
4805 // Callbacks() Test the callback function.
4806 // When set, callbacks occur periodically during matching operations,
4807 // giving the application code the ability to abort the operation
4808 // before it's normal completion.
4809 //
4810
4811 struct callBackContext {
4812 RegexTest *test;
4813 int32_t maxCalls;
4814 int32_t numCalls;
4815 int32_t lastSteps;
4816 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4817 };
4818
4819 U_CDECL_BEGIN
4820 static UBool U_CALLCONV
4821 testCallBackFn(const void *context, int32_t steps) {
4822 callBackContext *info = (callBackContext *)context;
4823 if (info->lastSteps+1 != steps) {
4824 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4825 }
4826 info->lastSteps = steps;
4827 info->numCalls++;
4828 return (info->numCalls < info->maxCalls);
4829 }
4830 U_CDECL_END
4831
4832 void RegexTest::Callbacks() {
4833 {
4834 // Getter returns NULLs if no callback has been set
4835
4836 // The variables that the getter will fill in.
4837 // Init to non-null values so that the action of the getter can be seen.
4838 const void *returnedContext = &returnedContext;
4839 URegexMatchCallback *returnedFn = &testCallBackFn;
4840
4841 UErrorCode status = U_ZERO_ERROR;
4842 RegexMatcher matcher("x", 0, status);
4843 REGEX_CHECK_STATUS;
4844 matcher.getMatchCallback(returnedFn, returnedContext, status);
4845 REGEX_CHECK_STATUS;
4846 REGEX_ASSERT(returnedFn == NULL);
4847 REGEX_ASSERT(returnedContext == NULL);
4848 }
4849
4850 {
4851 // Set and Get work
4852 callBackContext cbInfo = {this, 0, 0, 0};
4853 const void *returnedContext;
4854 URegexMatchCallback *returnedFn;
4855 UErrorCode status = U_ZERO_ERROR;
4856 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4857 REGEX_CHECK_STATUS;
4858 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4859 REGEX_CHECK_STATUS;
4860 matcher.getMatchCallback(returnedFn, returnedContext, status);
4861 REGEX_CHECK_STATUS;
4862 REGEX_ASSERT(returnedFn == testCallBackFn);
4863 REGEX_ASSERT(returnedContext == &cbInfo);
4864
4865 // A short-running match shouldn't invoke the callback
4866 status = U_ZERO_ERROR;
4867 cbInfo.reset(1);
4868 UnicodeString s = "xxx";
4869 matcher.reset(s);
4870 REGEX_ASSERT(matcher.matches(status));
4871 REGEX_CHECK_STATUS;
4872 REGEX_ASSERT(cbInfo.numCalls == 0);
4873
4874 // A medium-length match that runs long enough to invoke the
4875 // callback, but not so long that the callback aborts it.
4876 status = U_ZERO_ERROR;
4877 cbInfo.reset(4);
4878 s = "aaaaaaaaaaaaaaaaaaab";
4879 matcher.reset(s);
4880 REGEX_ASSERT(matcher.matches(status)==FALSE);
4881 REGEX_CHECK_STATUS;
4882 REGEX_ASSERT(cbInfo.numCalls > 0);
4883
4884 // A longer running match that the callback function will abort.
4885 status = U_ZERO_ERROR;
4886 cbInfo.reset(4);
4887 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4888 matcher.reset(s);
4889 REGEX_ASSERT(matcher.matches(status)==FALSE);
4890 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4891 REGEX_ASSERT(cbInfo.numCalls == 4);
4892
4893 // A longer running find that the callback function will abort.
4894 status = U_ZERO_ERROR;
4895 cbInfo.reset(4);
4896 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4897 matcher.reset(s);
4898 REGEX_ASSERT(matcher.find(status)==FALSE);
4899 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4900 REGEX_ASSERT(cbInfo.numCalls == 4);
4901 }
4902
4903
4904 }
4905
4906
4907 //
4908 // FindProgressCallbacks() Test the find "progress" callback function.
4909 // When set, the find progress callback will be invoked during a find operations
4910 // after each return from a match attempt, giving the application the opportunity
4911 // to terminate a long-running find operation before it's normal completion.
4912 //
4913
4914 struct progressCallBackContext {
4915 RegexTest *test;
4916 int64_t lastIndex;
4917 int32_t maxCalls;
4918 int32_t numCalls;
4919 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4920 };
4921
4922 // call-back function for find().
4923 // Return TRUE to continue the find().
4924 // Return FALSE to stop the find().
4925 U_CDECL_BEGIN
4926 static UBool U_CALLCONV
4927 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4928 progressCallBackContext *info = (progressCallBackContext *)context;
4929 info->numCalls++;
4930 info->lastIndex = matchIndex;
4931 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4932 return (info->numCalls < info->maxCalls);
4933 }
4934 U_CDECL_END
4935
4936 void RegexTest::FindProgressCallbacks() {
4937 {
4938 // Getter returns NULLs if no callback has been set
4939
4940 // The variables that the getter will fill in.
4941 // Init to non-null values so that the action of the getter can be seen.
4942 const void *returnedContext = &returnedContext;
4943 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4944
4945 UErrorCode status = U_ZERO_ERROR;
4946 RegexMatcher matcher("x", 0, status);
4947 REGEX_CHECK_STATUS;
4948 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4949 REGEX_CHECK_STATUS;
4950 REGEX_ASSERT(returnedFn == NULL);
4951 REGEX_ASSERT(returnedContext == NULL);
4952 }
4953
4954 {
4955 // Set and Get work
4956 progressCallBackContext cbInfo = {this, 0, 0, 0};
4957 const void *returnedContext;
4958 URegexFindProgressCallback *returnedFn;
4959 UErrorCode status = U_ZERO_ERROR;
4960 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4961 REGEX_CHECK_STATUS;
4962 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4963 REGEX_CHECK_STATUS;
4964 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4965 REGEX_CHECK_STATUS;
4966 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4967 REGEX_ASSERT(returnedContext == &cbInfo);
4968
4969 // A find that matches on the initial position does NOT invoke the callback.
4970 status = U_ZERO_ERROR;
4971 cbInfo.reset(100);
4972 UnicodeString s = "aaxxx";
4973 matcher.reset(s);
4974 #if 0
4975 matcher.setTrace(TRUE);
4976 #endif
4977 REGEX_ASSERT(matcher.find(0, status));
4978 REGEX_CHECK_STATUS;
4979 REGEX_ASSERT(cbInfo.numCalls == 0);
4980
4981 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4982 // but not so many times that we interrupt the operation.
4983 status = U_ZERO_ERROR;
4984 s = "aaaaaaaaaaaaaaaaaaab";
4985 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4986 matcher.reset(s);
4987 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4988 REGEX_CHECK_STATUS;
4989 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4990
4991 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4992 status = U_ZERO_ERROR;
4993 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4994 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4995 matcher.reset(s1);
4996 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4997 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4998 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4999
5000 // Now a match that will succeed, but after an interruption
5001 status = U_ZERO_ERROR;
5002 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
5003 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
5004 matcher.reset(s2);
5005 REGEX_ASSERT(matcher.find(0, status)==FALSE);
5006 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5007 // Now retry the match from where left off
5008 cbInfo.maxCalls = 100; // No callback limit
5009 status = U_ZERO_ERROR;
5010 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5011 REGEX_CHECK_STATUS;
5012 }
5013
5014
5015 }
5016
5017
5018 //---------------------------------------------------------------------------
5019 //
5020 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5021 // UTexts. The pure-C implementation of UText
5022 // has no mutable backing stores, but we can
5023 // use UnicodeString here to test the functionality.
5024 //
5025 //---------------------------------------------------------------------------
5026 void RegexTest::PreAllocatedUTextCAPI () {
5027 UErrorCode status = U_ZERO_ERROR;
5028 URegularExpression *re;
5029 UText patternText = UTEXT_INITIALIZER;
5030 UnicodeString buffer;
5031 UText bufferText = UTEXT_INITIALIZER;
5032
5033 utext_openUnicodeString(&bufferText, &buffer, &status);
5034
5035 /*
5036 * getText() and getUText()
5037 */
5038 {
5039 UText text1 = UTEXT_INITIALIZER;
5040 UText text2 = UTEXT_INITIALIZER;
5041 UChar text2Chars[20];
5042 UText *resultText;
5043
5044 status = U_ZERO_ERROR;
5045 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5046 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5047 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5048 utext_openUChars(&text2, text2Chars, -1, &status);
5049
5050 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5051 re = uregex_openUText(&patternText, 0, NULL, &status);
5052
5053 /* First set a UText */
5054 uregex_setUText(re, &text1, &status);
5055 resultText = uregex_getUText(re, &bufferText, &status);
5056 REGEX_CHECK_STATUS;
5057 REGEX_ASSERT(resultText == &bufferText);
5058 utext_setNativeIndex(resultText, 0);
5059 utext_setNativeIndex(&text1, 0);
5060 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5061
5062 resultText = uregex_getUText(re, &bufferText, &status);
5063 REGEX_CHECK_STATUS;
5064 REGEX_ASSERT(resultText == &bufferText);
5065 utext_setNativeIndex(resultText, 0);
5066 utext_setNativeIndex(&text1, 0);
5067 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5068
5069 /* Then set a UChar * */
5070 uregex_setText(re, text2Chars, 7, &status);
5071 resultText = uregex_getUText(re, &bufferText, &status);
5072 REGEX_CHECK_STATUS;
5073 REGEX_ASSERT(resultText == &bufferText);
5074 utext_setNativeIndex(resultText, 0);
5075 utext_setNativeIndex(&text2, 0);
5076 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5077
5078 uregex_close(re);
5079 utext_close(&text1);
5080 utext_close(&text2);
5081 }
5082
5083 /*
5084 * group()
5085 */
5086 {
5087 UChar text1[80];
5088 UText *actual;
5089 UBool result;
5090 int64_t length = 0;
5091
5092 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5093 // 012345678901234567890123456789012345678901234567
5094 // 0 1 2 3 4
5095
5096 status = U_ZERO_ERROR;
5097 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5098 REGEX_CHECK_STATUS;
5099
5100 uregex_setText(re, text1, -1, &status);
5101 result = uregex_find(re, 0, &status);
5102 REGEX_ASSERT(result==TRUE);
5103
5104 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5105 status = U_ZERO_ERROR;
5106 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5107 REGEX_CHECK_STATUS;
5108 REGEX_ASSERT(actual == &bufferText);
5109 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5110 REGEX_ASSERT(length == 16);
5111 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5112
5113 /* Capture group #1. Should succeed, matching " interior ". */
5114 status = U_ZERO_ERROR;
5115 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5116 REGEX_CHECK_STATUS;
5117 REGEX_ASSERT(actual == &bufferText);
5118 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5119 REGEX_ASSERT(length == 10);
5120 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5121
5122 /* Capture group out of range. Error. */
5123 status = U_ZERO_ERROR;
5124 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5125 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5126 REGEX_ASSERT(actual == &bufferText);
5127 uregex_close(re);
5128
5129 }
5130
5131 /*
5132 * replaceFirst()
5133 */
5134 {
5135 UChar text1[80];
5136 UChar text2[80];
5137 UText replText = UTEXT_INITIALIZER;
5138 UText *result;
5139 status = U_ZERO_ERROR;
5140 utext_openUnicodeString(&bufferText, &buffer, &status);
5141
5142 status = U_ZERO_ERROR;
5143 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5144 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5145 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5146
5147 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5148 REGEX_CHECK_STATUS;
5149
5150 /* Normal case, with match */
5151 uregex_setText(re, text1, -1, &status);
5152 REGEX_CHECK_STATUS;
5153 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5154 REGEX_CHECK_STATUS;
5155 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5156 REGEX_CHECK_STATUS;
5157 REGEX_ASSERT(result == &bufferText);
5158 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5159
5160 /* No match. Text should copy to output with no changes. */
5161 uregex_setText(re, text2, -1, &status);
5162 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5163 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5164 REGEX_CHECK_STATUS;
5165 REGEX_ASSERT(result == &bufferText);
5166 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5167
5168 /* Unicode escapes */
5169 uregex_setText(re, text1, -1, &status);
5170 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5171 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5172 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5173 REGEX_CHECK_STATUS;
5174 REGEX_ASSERT(result == &bufferText);
5175 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5176
5177 uregex_close(re);
5178 utext_close(&replText);
5179 }
5180
5181
5182 /*
5183 * replaceAll()
5184 */
5185 {
5186 UChar text1[80];
5187 UChar text2[80];
5188 UText replText = UTEXT_INITIALIZER;
5189 UText *result;
5190
5191 status = U_ZERO_ERROR;
5192 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5193 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5194 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5195
5196 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5197 REGEX_CHECK_STATUS;
5198
5199 /* Normal case, with match */
5200 uregex_setText(re, text1, -1, &status);
5201 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5202 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5203 REGEX_CHECK_STATUS;
5204 REGEX_ASSERT(result == &bufferText);
5205 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5206
5207 /* No match. Text should copy to output with no changes. */
5208 uregex_setText(re, text2, -1, &status);
5209 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5210 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5211 REGEX_CHECK_STATUS;
5212 REGEX_ASSERT(result == &bufferText);
5213 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5214
5215 uregex_close(re);
5216 utext_close(&replText);
5217 }
5218
5219
5220 /*
5221 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5222 * so we don't need to test it here.
5223 */
5224
5225 utext_close(&bufferText);
5226 utext_close(&patternText);
5227 }
5228
5229
5230 //--------------------------------------------------------------
5231 //
5232 // NamedCapture Check basic named capture group functionality
5233 //
5234 //--------------------------------------------------------------
5235 void RegexTest::NamedCapture() {
5236 UErrorCode status = U_ZERO_ERROR;
5237 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5238 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5239 REGEX_CHECK_STATUS;
5240 int32_t group = pat->groupNumberFromName("five", -1, status);
5241 REGEX_CHECK_STATUS;
5242 REGEX_ASSERT(5 == group);
5243 group = pat->groupNumberFromName("three", -1, status);
5244 REGEX_CHECK_STATUS;
5245 REGEX_ASSERT(3 == group);
5246
5247 status = U_ZERO_ERROR;
5248 group = pat->groupNumberFromName(UnicodeString("six"), status);
5249 REGEX_CHECK_STATUS;
5250 REGEX_ASSERT(6 == group);
5251
5252 status = U_ZERO_ERROR;
5253 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5254 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5255
5256 status = U_ZERO_ERROR;
5257
5258 // After copying a pattern, named capture should still work in the copy.
5259 RegexPattern *copiedPat = new RegexPattern(*pat);
5260 REGEX_ASSERT(*copiedPat == *pat);
5261 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5262
5263 group = copiedPat->groupNumberFromName("five", -1, status);
5264 REGEX_CHECK_STATUS;
5265 REGEX_ASSERT(5 == group);
5266 group = copiedPat->groupNumberFromName("three", -1, status);
5267 REGEX_CHECK_STATUS;
5268 REGEX_ASSERT(3 == group);
5269 delete copiedPat;
5270
5271 // ReplaceAll with named capture group.
5272 status = U_ZERO_ERROR;
5273 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5274 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5275 REGEX_CHECK_STATUS;
5276 // m.pattern().dumpPattern();
5277 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5278 REGEX_CHECK_STATUS;
5279 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5280 delete m;
5281
5282 // ReplaceAll, allowed capture group numbers.
5283 text = UnicodeString("abcmxyz");
5284 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5285 REGEX_CHECK_STATUS;
5286
5287 status = U_ZERO_ERROR;
5288 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5289 REGEX_CHECK_STATUS;
5290 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5291
5292 status = U_ZERO_ERROR;
5293 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5294 REGEX_CHECK_STATUS;
5295 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5296
5297 status = U_ZERO_ERROR;
5298 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5299 REGEX_CHECK_STATUS;
5300 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5301
5302 status = U_ZERO_ERROR;
5303 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5304 REGEX_CHECK_STATUS;
5305 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5306
5307 status = U_ZERO_ERROR;
5308 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5309 REGEX_CHECK_STATUS;
5310 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5311
5312 status = U_ZERO_ERROR;
5313 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5314 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5315
5316 status = U_ZERO_ERROR;
5317 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5318 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5319 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5320
5321 status = U_ZERO_ERROR;
5322 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5323 REGEX_CHECK_STATUS; // that push group num out of range.
5324 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5325
5326 status = U_ZERO_ERROR;
5327 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5328 REGEX_CHECK_STATUS;
5329 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5330
5331 status = U_ZERO_ERROR;
5332 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5333 REGEX_CHECK_STATUS;
5334 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5335
5336 status = U_ZERO_ERROR;
5337 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5338 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5339
5340 status = U_ZERO_ERROR;
5341 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5342 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5343
5344 status = U_ZERO_ERROR;
5345 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5346 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5347
5348 status = U_ZERO_ERROR;
5349 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5350 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5351
5352 delete m;
5353
5354 // Repeat the above replaceAll() tests using the plain C API, which
5355 // has a separate implementation internally.
5356 // TODO: factor out the test data.
5357
5358 status = U_ZERO_ERROR;
5359 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5360 REGEX_CHECK_STATUS;
5361 text = UnicodeString("abcmxyz");
5362 uregex_setText(re, text.getBuffer(), text.length(), &status);
5363 REGEX_CHECK_STATUS;
5364
5365 UChar resultBuf[100];
5366 int32_t resultLength;
5367 UnicodeString repl;
5368
5369 status = U_ZERO_ERROR;
5370 repl = UnicodeString("<$0>");
5371 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5372 REGEX_CHECK_STATUS;
5373 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5374
5375 status = U_ZERO_ERROR;
5376 repl = UnicodeString("<$1>");
5377 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5378 REGEX_CHECK_STATUS;
5379 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5380
5381 status = U_ZERO_ERROR;
5382 repl = UnicodeString("<${one}>");
5383 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5384 REGEX_CHECK_STATUS;
5385 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5386
5387 status = U_ZERO_ERROR;
5388 repl = UnicodeString("<$2>");
5389 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5390 REGEX_CHECK_STATUS;
5391 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5392
5393 status = U_ZERO_ERROR;
5394 repl = UnicodeString("<$3>");
5395 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5396 REGEX_CHECK_STATUS;
5397 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5398
5399 status = U_ZERO_ERROR;
5400 repl = UnicodeString("<$4>");
5401 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5402 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5403
5404 status = U_ZERO_ERROR;
5405 repl = UnicodeString("<$04>");
5406 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5407 REGEX_CHECK_STATUS;
5408 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5409
5410 status = U_ZERO_ERROR;
5411 repl = UnicodeString("<$000016>");
5412 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5413 REGEX_CHECK_STATUS;
5414 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5415
5416 status = U_ZERO_ERROR;
5417 repl = UnicodeString("<$3$2$1${one}>");
5418 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5419 REGEX_CHECK_STATUS;
5420 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5421
5422 status = U_ZERO_ERROR;
5423 repl = UnicodeString("$3$2$1${one}");
5424 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5425 REGEX_CHECK_STATUS;
5426 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5427
5428 status = U_ZERO_ERROR;
5429 repl = UnicodeString("<${noSuchName}>");
5430 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5431 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5432
5433 status = U_ZERO_ERROR;
5434 repl = UnicodeString("<${invalid-name}>");
5435 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5436 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5437
5438 status = U_ZERO_ERROR;
5439 repl = UnicodeString("<${one");
5440 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5441 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5442
5443 status = U_ZERO_ERROR;
5444 repl = UnicodeString("$not a capture group");
5445 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5446 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5447
5448 uregex_close(re);
5449 }
5450
5451 //--------------------------------------------------------------
5452 //
5453 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5454 // The point is not so much what the exact limit is,
5455 // but that a largish number doesn't hit bad non-linear performance,
5456 // and that exceeding the limit fails cleanly.
5457 //
5458 //--------------------------------------------------------------
5459 void RegexTest::NamedCaptureLimits() {
5460 if (quick) {
5461 logln("Skipping test. Runs in exhuastive mode only.");
5462 return;
5463 }
5464 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5465 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5466 char nnbuf[100];
5467 UnicodeString pattern;
5468 int32_t nn;
5469
5470 for (nn=1; nn<goodLimit; nn++) {
5471 sprintf(nnbuf, "(?<nn%d>)", nn);
5472 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5473 }
5474 UErrorCode status = U_ZERO_ERROR;
5475 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5476 REGEX_CHECK_STATUS;
5477 for (nn=1; nn<goodLimit; nn++) {
5478 sprintf(nnbuf, "nn%d", nn);
5479 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5480 REGEX_ASSERT(nn == groupNum);
5481 if (nn != groupNum) {
5482 break;
5483 }
5484 }
5485 delete pat;
5486
5487 pattern.remove();
5488 for (nn=1; nn<failLimit; nn++) {
5489 sprintf(nnbuf, "(?<nn%d>)", nn);
5490 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5491 }
5492 status = U_ZERO_ERROR;
5493 pat = RegexPattern::compile(pattern, 0, status);
5494 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5495 delete pat;
5496 }
5497
5498
5499 //--------------------------------------------------------------
5500 //
5501 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5502 //
5503 //---------------------------------------------------------------
5504 void RegexTest::Bug7651() {
5505 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5506 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5507 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5508 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5509 UnicodeString s("#ff @abcd This is test");
5510 RegexPattern *REPattern = NULL;
5511 RegexMatcher *REMatcher = NULL;
5512 UErrorCode status = U_ZERO_ERROR;
5513 UParseError pe;
5514
5515 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5516 REGEX_CHECK_STATUS;
5517 REMatcher = REPattern->matcher(s, status);
5518 REGEX_CHECK_STATUS;
5519 REGEX_ASSERT(REMatcher->find());
5520 REGEX_ASSERT(REMatcher->start(status) == 0);
5521 delete REPattern;
5522 delete REMatcher;
5523 status = U_ZERO_ERROR;
5524
5525 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5526 REGEX_CHECK_STATUS;
5527 REMatcher = REPattern->matcher(s, status);
5528 REGEX_CHECK_STATUS;
5529 REGEX_ASSERT(REMatcher->find());
5530 REGEX_ASSERT(REMatcher->start(status) == 0);
5531 delete REPattern;
5532 delete REMatcher;
5533 status = U_ZERO_ERROR;
5534 }
5535
5536 void RegexTest::Bug7740() {
5537 UErrorCode status = U_ZERO_ERROR;
5538 UnicodeString pattern = "(a)";
5539 UnicodeString text = "abcdef";
5540 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5541 REGEX_CHECK_STATUS;
5542 REGEX_ASSERT(m->lookingAt(status));
5543 REGEX_CHECK_STATUS;
5544 status = U_ILLEGAL_ARGUMENT_ERROR;
5545 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5546 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5547 REGEX_ASSERT(s == "");
5548 delete m;
5549 }
5550
5551 // Bug 8479: was crashing whith a Bogus UnicodeString as input.
5552
5553 void RegexTest::Bug8479() {
5554 UErrorCode status = U_ZERO_ERROR;
5555
5556 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5557 REGEX_CHECK_STATUS;
5558 if (U_SUCCESS(status))
5559 {
5560 UnicodeString str;
5561 str.setToBogus();
5562 pMatcher->reset(str);
5563 status = U_ZERO_ERROR;
5564 pMatcher->matches(status);
5565 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5566 delete pMatcher;
5567 }
5568 }
5569
5570
5571 // Bug 7029
5572 void RegexTest::Bug7029() {
5573 UErrorCode status = U_ZERO_ERROR;
5574
5575 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5576 UnicodeString text = "abc.def";
5577 UnicodeString splits[10];
5578 REGEX_CHECK_STATUS;
5579 int32_t numFields = pMatcher->split(text, splits, 10, status);
5580 REGEX_CHECK_STATUS;
5581 REGEX_ASSERT(numFields == 8);
5582 delete pMatcher;
5583 }
5584
5585 // Bug 9283
5586 // This test is checking for the existance of any supplemental characters that case-fold
5587 // to a bmp character.
5588 //
5589 // At the time of this writing there are none. If any should appear in a subsequent release
5590 // of Unicode, the code in regular expressions compilation that determines the longest
5591 // posssible match for a literal string will need to be enhanced.
5592 //
5593 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5594 // for details on what to do in case of a failure of this test.
5595 //
5596 void RegexTest::Bug9283() {
5597 #if !UCONFIG_NO_NORMALIZATION
5598 UErrorCode status = U_ZERO_ERROR;
5599 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5600 REGEX_CHECK_STATUS;
5601 int32_t index;
5602 UChar32 c;
5603 for (index=0; ; index++) {
5604 c = supplementalsWithCaseFolding.charAt(index);
5605 if (c == -1) {
5606 break;
5607 }
5608 UnicodeString cf = UnicodeString(c).foldCase();
5609 REGEX_ASSERT(cf.length() >= 2);
5610 }
5611 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5612 }
5613
5614
5615 void RegexTest::CheckInvBufSize() {
5616 if(inv_next>=INV_BUFSIZ) {
5617 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5618 __FILE__, INV_BUFSIZ, inv_next);
5619 } else {
5620 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5621 }
5622 }
5623
5624
5625 void RegexTest::Bug10459() {
5626 UErrorCode status = U_ZERO_ERROR;
5627 UnicodeString patternString("(txt)");
5628 UnicodeString txtString("txt");
5629
5630 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5631 REGEX_CHECK_STATUS;
5632 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5633 REGEX_CHECK_STATUS;
5634
5635 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5636 REGEX_CHECK_STATUS;
5637
5638 uregex_setUText(icu_re, utext_txt, &status);
5639 REGEX_CHECK_STATUS;
5640
5641 // The bug was that calling uregex_group() before doing a matching operation
5642 // was causing a segfault. Only for Regular Expressions created from UText.
5643 // It should set an U_REGEX_INVALID_STATE.
5644
5645 UChar buf[100];
5646 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5647 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5648 REGEX_ASSERT(len == 0);
5649
5650 uregex_close(icu_re);
5651 utext_close(utext_pat);
5652 utext_close(utext_txt);
5653 }
5654
5655 void RegexTest::TestCaseInsensitiveStarters() {
5656 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5657 // become stale because of new Unicode characters.
5658 // If it is stale, rerun the generation tool
5659 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5660 // and replace the embedded data in i18n/regexcmp.cpp
5661
5662 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5663 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5664 continue;
5665 }
5666 UnicodeSet s(cp, cp);
5667 s.closeOver(USET_CASE_INSENSITIVE);
5668 UnicodeSetIterator setIter(s);
5669 while (setIter.next()) {
5670 if (!setIter.isString()) {
5671 continue;
5672 }
5673 const UnicodeString &str = setIter.getString();
5674 UChar32 firstChar = str.char32At(0);
5675 UnicodeSet starters;
5676 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5677 if (!starters.contains(cp)) {
5678 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5679 return;
5680 }
5681 }
5682 }
5683 }
5684
5685
5686 void RegexTest::TestBug11049() {
5687 // Original bug report: pattern with match start consisting of one of several individual characters,
5688 // and the text being matched ending with a supplementary character. find() would read past the
5689 // end of the input text when searching for potential match starting points.
5690
5691 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5692 // detect the bad read.
5693
5694 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5695 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5696
5697 // Test again with a pattern starting with a single character,
5698 // which takes a different code path than starting with an OR expression,
5699 // but with similar logic.
5700 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5701 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5702 }
5703
5704 // Run a single test case from TestBug11049(). Internal function.
5705 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5706 UErrorCode status = U_ZERO_ERROR;
5707 UnicodeString patternString = UnicodeString(pattern).unescape();
5708 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5709
5710 UnicodeString dataString = UnicodeString(data).unescape();
5711 UChar *exactBuffer = new UChar[dataString.length()];
5712 dataString.extract(exactBuffer, dataString.length(), status);
5713 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5714
5715 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5716 REGEX_CHECK_STATUS;
5717 matcher->reset(ut);
5718 UBool result = matcher->find();
5719 if (result != expectMatch) {
5720 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5721 __FILE__, lineNumber, expectMatch, result, pattern, data);
5722 }
5723
5724 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5725 // off-by-one on find() with match at the last code point.
5726 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5727 // because string.unescape() will only shrink it.
5728 char * utf8Buffer = new char[uprv_strlen(data)+1];
5729 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5730 REGEX_CHECK_STATUS;
5731 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5732 REGEX_CHECK_STATUS;
5733 matcher->reset(ut);
5734 result = matcher->find();
5735 if (result != expectMatch) {
5736 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5737 __FILE__, lineNumber, expectMatch, result, pattern, data);
5738 }
5739 delete [] utf8Buffer;
5740
5741 utext_close(ut);
5742 delete [] exactBuffer;
5743 }
5744
5745
5746 void RegexTest::TestBug11371() {
5747 if (quick) {
5748 logln("Skipping test. Runs in exhuastive mode only.");
5749 return;
5750 }
5751 UErrorCode status = U_ZERO_ERROR;
5752 UnicodeString patternString;
5753
5754 for (int i=0; i<8000000; i++) {
5755 patternString.append(UnicodeString("()"));
5756 }
5757 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5758 if (status != U_REGEX_PATTERN_TOO_BIG) {
5759 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5760 __FILE__, __LINE__, u_errorName(status));
5761 }
5762
5763 status = U_ZERO_ERROR;
5764 patternString = "(";
5765 for (int i=0; i<20000000; i++) {
5766 patternString.append(UnicodeString("A++"));
5767 }
5768 patternString.append(UnicodeString("){0}B++"));
5769 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5770 if (status != U_REGEX_PATTERN_TOO_BIG) {
5771 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5772 __FILE__, __LINE__, u_errorName(status));
5773 }
5774
5775 // Pattern with too much string data, such that string indexes overflow operand data field size
5776 // in compiled instruction.
5777 status = U_ZERO_ERROR;
5778 patternString = "";
5779 while (patternString.length() < 0x00ffffff) {
5780 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5781 }
5782 patternString.append(UnicodeString("X? trailing string"));
5783 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5784 if (status != U_REGEX_PATTERN_TOO_BIG) {
5785 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5786 __FILE__, __LINE__, u_errorName(status));
5787 }
5788 }
5789
5790 void RegexTest::TestBug11480() {
5791 // C API, get capture group of a group that does not participate in the match.
5792 // (Returns a zero length string, with nul termination,
5793 // indistinguishable from a group with a zero length match.)
5794
5795 UErrorCode status = U_ZERO_ERROR;
5796 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5797 REGEX_CHECK_STATUS;
5798 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5799 uregex_setText(re, text.getBuffer(), text.length(), &status);
5800 REGEX_CHECK_STATUS;
5801 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5802 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5803 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5804 REGEX_ASSERT(length == 0);
5805 REGEX_ASSERT(buf[0] == 13);
5806 REGEX_ASSERT(buf[1] == 0);
5807 REGEX_ASSERT(buf[2] == 13);
5808 uregex_close(re);
5809
5810 // UText C++ API, length of match is 0 for non-participating matches.
5811 UText ut = UTEXT_INITIALIZER;
5812 utext_openUnicodeString(&ut, &text, &status);
5813 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5814 REGEX_CHECK_STATUS;
5815 matcher.reset(&ut);
5816 REGEX_ASSERT(matcher.lookingAt(0, status));
5817
5818 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5819 int64_t groupLen = -666;
5820 UText group = UTEXT_INITIALIZER;
5821 matcher.group(1, &group, groupLen, status);
5822 REGEX_CHECK_STATUS;
5823 REGEX_ASSERT(groupLen == 1);
5824 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5825
5826 // Capture group 2, the (B), does not participate in the match.
5827 matcher.group(2, &group, groupLen, status);
5828 REGEX_CHECK_STATUS;
5829 REGEX_ASSERT(groupLen == 0);
5830 REGEX_ASSERT(matcher.start(2, status) == -1);
5831 REGEX_CHECK_STATUS;
5832 }
5833
5834
5835 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */