]> git.saurik.com Git - apple/icu.git/blob - icuSources/test/intltest/regextst.cpp
ICU-64260.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 //
10 // regextst.cpp
11 //
12 // ICU Regular Expressions test, part of intltest.
13 //
14
15 /*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
42 #include "cstr.h"
43 #include "regextst.h"
44 #include "regexcmp.h"
45 #include "uvector.h"
46 #include "util.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "uinvchar.h"
50
51 #define SUPPORT_MUTATING_INPUT_STRING 0
52
53 //---------------------------------------------------------------------------
54 //
55 // Test class boilerplate
56 //
57 //---------------------------------------------------------------------------
58 RegexTest::RegexTest()
59 {
60 }
61
62
63 RegexTest::~RegexTest()
64 {
65 }
66
67
68
// Test dispatcher for this suite.
//
// When `exec` is true a banner line is logged first. The remainder of the body
// is a list of TESTCASE_AUTO(...) entries, one per test method of RegexTest,
// bracketed by TESTCASE_AUTO_BEGIN / TESTCASE_AUTO_END.
// NOTE(review): the macros are defined outside this file (presumably in
// intltest.h); they appear to map `index` to an entry by position, so the
// order of the entries below should be treated as significant -- confirm
// before reordering.
//
// The `Extended` test is only listed when file I/O is compiled in
// (!UCONFIG_NO_FILE_IO). The last parameter is unused.
69 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
70 {
71 if (exec) logln("TestSuite RegexTest: ");
72 TESTCASE_AUTO_BEGIN;
73 TESTCASE_AUTO(Basic);
74 TESTCASE_AUTO(API_Match);
75 TESTCASE_AUTO(API_Replace);
76 TESTCASE_AUTO(API_Pattern);
77 #if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended);
79 #endif
80 TESTCASE_AUTO(Errors);
81 TESTCASE_AUTO(PerlTests);
82 TESTCASE_AUTO(Callbacks);
83 TESTCASE_AUTO(FindProgressCallbacks);
84 TESTCASE_AUTO(Bug6149);
85 TESTCASE_AUTO(UTextBasic);
86 TESTCASE_AUTO(API_Match_UTF8);
87 TESTCASE_AUTO(API_Replace_UTF8);
88 TESTCASE_AUTO(API_Pattern_UTF8);
89 TESTCASE_AUTO(PerlTestsUTF8);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI);
91 TESTCASE_AUTO(Bug7651);
92 TESTCASE_AUTO(Bug7740);
93 TESTCASE_AUTO(Bug8479);
94 TESTCASE_AUTO(Bug7029);
95 TESTCASE_AUTO(CheckInvBufSize);
96 TESTCASE_AUTO(Bug9283);
97 TESTCASE_AUTO(Bug10459);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters);
99 TESTCASE_AUTO(TestBug11049);
100 TESTCASE_AUTO(TestBug11371);
101 TESTCASE_AUTO(TestBug11480);
102 TESTCASE_AUTO(NamedCapture);
103 TESTCASE_AUTO(NamedCaptureLimits);
104 TESTCASE_AUTO(TestBug12884);
105 TESTCASE_AUTO(TestBug13631);
106 TESTCASE_AUTO(TestBug13632);
107 TESTCASE_AUTO(TestBug20359);
108 TESTCASE_AUTO_END;
109 }
110
111
112 /**
113 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
114 * into ASCII.
115 * @see utext_openUTF8
116 */
117 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
118
119 //---------------------------------------------------------------------------
120 //
121 // Error Checking / Reporting macros used in all of the tests.
122 //
123 //---------------------------------------------------------------------------
124
125 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
126 int64_t oldIndex = utext_getNativeIndex(text);
127 utext_setNativeIndex(text, 0);
128 char *bufPtr = buf;
129 UChar32 c = utext_next32From(text, 0);
130 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
131 if (0x000020<=c && c<0x00007e) {
132 *bufPtr = c;
133 } else {
134 #if 0
135 sprintf(bufPtr,"U+%04X", c);
136 bufPtr+= strlen(bufPtr)-1;
137 #else
138 *bufPtr = '%';
139 #endif
140 }
141 bufPtr++;
142 c = UTEXT_NEXT32(text);
143 }
144 *bufPtr = 0;
145 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
146 char *ebuf = (char*)malloc(bufLen);
147 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
148 uprv_strncpy(buf, ebuf, bufLen);
149 free((void*)ebuf);
150 #endif
151 utext_setNativeIndex(text, oldIndex);
152 }
153
154
155 static char ASSERT_BUF[1024];
156
157 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
158 if(message.length()==0) {
159 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
160 } else {
161 UnicodeString buf;
162 IntlTest::prettify(message,buf);
163 if(buf.length()==0) {
164 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
165 } else {
166 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
167 if(ASSERT_BUF[0]==0) {
168 ASSERT_BUF[0]=0;
169 for(int32_t i=0;i<buf.length();i++) {
170 UChar ch = buf[i];
171 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
172 }
173 }
174 }
175 }
176 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
177 return ASSERT_BUF;
178 }
179
// REGEX_VERBOSE_TEXT(text): logs a printable rendering of the UText `text`
// (at most 199 chars) together with the source file/line and the spelling of
// the argument expression.
180 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
181
// REGEX_CHECK_STATUS: if the in-scope variable `status` holds a failure code,
// reports it via dataerrln() and RETURNS from the enclosing (void) function.
182 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
183 __FILE__, __LINE__, u_errorName(status)); return;}}
184
// REGEX_ASSERT(expr): reports an error if `expr` compares equal to FALSE.
// Does not return; the test continues after a failure.
185 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
186
// REGEX_ASSERT_FAIL(expr, errcode): evaluates `expr` with a fresh local
// `status` (which shadows any outer `status`, so `expr` must refer to the
// error code by that name) and reports an error unless the resulting status
// equals `errcode`.
187 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
188 if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
189 __LINE__, u_errorName(errcode), u_errorName(status));};}
190
// REGEX_CHECK_STATUS_L(line): like REGEX_CHECK_STATUS but also prints the
// caller-supplied `line`, prints the status numerically, and does NOT return.
191 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
192 "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}
193
// REGEX_ASSERT_L(expr, line): like REGEX_ASSERT but also prints the
// caller-supplied `line`, and RETURNS from the enclosing (void) function on
// failure.
194 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
195 errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
196
// REGEX_ASSERT_UNISTR(expected, actual): reports an error unless `actual`
// equals `expected` converted with the invariant-character converter.
197 // expected: const char * , restricted to invariant characters.
198 // actual: const UnicodeString &
199 #define REGEX_ASSERT_UNISTR(expected, actual) { \
200 if (UnicodeString(expected, -1, US_INV) != (actual)) { \
201 errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
202 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
203
204
205 static UBool testUTextEqual(UText *uta, UText *utb) {
206 UChar32 ca = 0;
207 UChar32 cb = 0;
208 utext_setNativeIndex(uta, 0);
209 utext_setNativeIndex(utb, 0);
210 do {
211 ca = utext_next32(uta);
212 cb = utext_next32(utb);
213 if (ca != cb) {
214 break;
215 }
216 } while (ca != U_SENTINEL);
217 return ca == cb;
218 }
219
220
221 /**
222 * @param expected expected text in UTF-8 (not platform) codepage
223 */
224 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
225 UErrorCode status = U_ZERO_ERROR;
226 UText expectedText = UTEXT_INITIALIZER;
227 utext_openUTF8(&expectedText, expected, -1, &status);
228 if(U_FAILURE(status)) {
229 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
230 return;
231 }
232 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
233 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
234 return;
235 }
236 utext_setNativeIndex(actual, 0);
237 if (!testUTextEqual(&expectedText, actual)) {
238 char buf[201 /*21*/];
239 char expectedBuf[201];
240 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
241 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
242 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
243 }
244 utext_close(&expectedText);
245 }
246 /**
247 * @param expected invariant (platform local text) input
248 */
249
250 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
251 UErrorCode status = U_ZERO_ERROR;
252 UText expectedText = UTEXT_INITIALIZER;
253 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
254 if(U_FAILURE(status)) {
255 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
256 return;
257 }
258 utext_setNativeIndex(actual, 0);
259 if (!testUTextEqual(&expectedText, actual)) {
260 char buf[201 /*21*/];
261 char expectedBuf[201];
262 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
263 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
264 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
265 }
266 utext_close(&expectedText);
267 }
268
269 /**
270 * Assumes utf-8 input
271 */
272 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
273 /**
274 * Assumes Invariant input
275 */
276 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
277
278 /**
279 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
280 * passed into utext_openUTF8. An error will be given if
281 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
282 */
283
284 #define INV_BUFSIZ 2048 /* increase this if too small */
285
286 static int64_t inv_next=0;
287
288 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
289 static char inv_buf[INV_BUFSIZ];
290 #endif
291
292 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
293 if(length==-1) length=strlen(inv);
294 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
295 inv_next+=length;
296 return utext_openUTF8(ut, inv, length, status);
297 #else
298 if(inv_next+length+1>INV_BUFSIZ) {
299 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
300 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
301 *status = U_MEMORY_ALLOCATION_ERROR;
302 return NULL;
303 }
304
305 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
306 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
307 inv_next+=length;
308
309 #if 0
310 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
311 #endif
312
313 return utext_openUTF8(ut, (const char*)buf, length, status);
314 #endif
315 }
316
317
318 //---------------------------------------------------------------------------
319 //
320 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
321 // for the LookingAt() and Match() functions.
322 //
323 // usage:
324 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
325 //
326 // The expected results are UBool - TRUE or FALSE.
327 // The input text is unescaped. The pattern is not.
328 //
329 //
330 //---------------------------------------------------------------------------
331
332 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
333
334 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
335 const UnicodeString pattern(pat, -1, US_INV);
336 const UnicodeString inputText(text, -1, US_INV);
337 UErrorCode status = U_ZERO_ERROR;
338 UParseError pe;
339 RegexPattern *REPattern = NULL;
340 RegexMatcher *REMatcher = NULL;
341 UBool retVal = TRUE;
342
343 UnicodeString patString(pat, -1, US_INV);
344 REPattern = RegexPattern::compile(patString, 0, pe, status);
345 if (U_FAILURE(status)) {
346 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
347 line, u_errorName(status));
348 return FALSE;
349 }
350 if (line==376) { REPattern->dumpPattern();}
351
352 UnicodeString inputString(inputText);
353 UnicodeString unEscapedInput = inputString.unescape();
354 REMatcher = REPattern->matcher(unEscapedInput, status);
355 if (U_FAILURE(status)) {
356 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
357 line, u_errorName(status));
358 return FALSE;
359 }
360
361 UBool actualmatch;
362 actualmatch = REMatcher->lookingAt(status);
363 if (U_FAILURE(status)) {
364 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
365 line, u_errorName(status));
366 retVal = FALSE;
367 }
368 if (actualmatch != looking) {
369 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
370 retVal = FALSE;
371 }
372
373 status = U_ZERO_ERROR;
374 actualmatch = REMatcher->matches(status);
375 if (U_FAILURE(status)) {
376 errln("RegexTest failure in matches() at line %d. Status = %s\n",
377 line, u_errorName(status));
378 retVal = FALSE;
379 }
380 if (actualmatch != match) {
381 errln("RegexTest: wrong return from matches() at line %d.\n", line);
382 retVal = FALSE;
383 }
384
385 if (retVal == FALSE) {
386 REPattern->dumpPattern();
387 }
388
389 delete REPattern;
390 delete REMatcher;
391 return retVal;
392 }
393
394
395 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
396 UText pattern = UTEXT_INITIALIZER;
397 int32_t inputUTF8Length;
398 char *textChars = NULL;
399 UText inputText = UTEXT_INITIALIZER;
400 UErrorCode status = U_ZERO_ERROR;
401 UParseError pe;
402 RegexPattern *REPattern = NULL;
403 RegexMatcher *REMatcher = NULL;
404 UBool retVal = TRUE;
405
406 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
407 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
408 if (U_FAILURE(status)) {
409 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
410 line, u_errorName(status));
411 return FALSE;
412 }
413
414 UnicodeString inputString(text, -1, US_INV);
415 UnicodeString unEscapedInput = inputString.unescape();
416 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
417 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
418
419 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
420 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
421 // UTF-8 does not allow unpaired surrogates, so this could actually happen
422 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
423 return TRUE; // not a failure of the Regex engine
424 }
425 status = U_ZERO_ERROR; // buffer overflow
426 textChars = new char[inputUTF8Length+1];
427 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
428 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
429
430 REMatcher = &REPattern->matcher(status)->reset(&inputText);
431 if (U_FAILURE(status)) {
432 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
433 line, u_errorName(status));
434 return FALSE;
435 }
436
437 UBool actualmatch;
438 actualmatch = REMatcher->lookingAt(status);
439 if (U_FAILURE(status)) {
440 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
441 line, u_errorName(status));
442 retVal = FALSE;
443 }
444 if (actualmatch != looking) {
445 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
446 retVal = FALSE;
447 }
448
449 status = U_ZERO_ERROR;
450 actualmatch = REMatcher->matches(status);
451 if (U_FAILURE(status)) {
452 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
453 line, u_errorName(status));
454 retVal = FALSE;
455 }
456 if (actualmatch != match) {
457 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
458 retVal = FALSE;
459 }
460
461 if (retVal == FALSE) {
462 REPattern->dumpPattern();
463 }
464
465 delete REPattern;
466 delete REMatcher;
467 utext_close(&inputText);
468 utext_close(&pattern);
469 delete[] textChars;
470 return retVal;
471 }
472
473
474
475 //---------------------------------------------------------------------------
476 //
477 // REGEX_ERR Macro + invocation function to simplify writing tests
478 // regex tests for incorrect patterns
479 //
480 // usage:
481 // REGEX_ERR("pattern", expected error line, column, expected status);
482 //
483 //---------------------------------------------------------------------------
484 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
485
486 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
487 UErrorCode expectedStatus, int32_t line) {
488 UnicodeString pattern(pat);
489
490 UErrorCode status = U_ZERO_ERROR;
491 UParseError pe;
492 RegexPattern *callerPattern = NULL;
493
494 //
495 // Compile the caller's pattern
496 //
497 UnicodeString patString(pat);
498 callerPattern = RegexPattern::compile(patString, 0, pe, status);
499 if (status != expectedStatus) {
500 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
501 } else {
502 if (status != U_ZERO_ERROR) {
503 if (pe.line != errLine || pe.offset != errCol) {
504 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
505 line, errLine, errCol, pe.line, pe.offset);
506 }
507 }
508 }
509
510 delete callerPattern;
511
512 //
513 // Compile again, using a UTF-8-based UText
514 //
515 UText patternText = UTEXT_INITIALIZER;
516 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
517 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
518 if (status != expectedStatus) {
519 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
520 } else {
521 if (status != U_ZERO_ERROR) {
522 if (pe.line != errLine || pe.offset != errCol) {
523 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
524 line, errLine, errCol, pe.line, pe.offset);
525 }
526 }
527 }
528
529 delete callerPattern;
530 utext_close(&patternText);
531 }
532
533
534
535 //---------------------------------------------------------------------------
536 //
537 // Basic Check for basic functionality of regex pattern matching.
538 // Avoid the use of REGEX_FIND test macro, which has
539 // substantial dependencies on basic Regex functionality.
540 //
541 //---------------------------------------------------------------------------
// Basic sanity checks, expressed entirely with REGEX_TESTLM.
// Each REGEX_TESTLM(pattern, input, lookingAt-expected, matches-expected)
// line runs the check twice: once through doRegexLMTest() and once through
// doRegexLMTestUTF8(). The macro passes __LINE__ to both helpers, so failure
// messages identify the individual line below.
// The input text is unescaped before matching; the pattern is not.
542 void RegexTest::Basic() {
543
544
545 //
546 // Debug - slide failing test cases early
547 //
// (Disabled scratch area for reproducing a single failing case in isolation.)
548 #if 0
549 {
550 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
551 UParseError pe;
552 UErrorCode status = U_ZERO_ERROR;
553 RegexPattern *pattern;
554 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
555 pattern->dumpPattern();
556 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
557 UBool result = m->find();
558 printf("result = %d\n", result);
559 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
560 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
561 }
562 exit(1);
563 #endif
564
565
566 //
567 // Pattern with parentheses
568 //
569 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
570 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
571 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
572
573 //
574 // Patterns with *
575 //
576 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
577 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
578 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
579 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
580 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
581
582 REGEX_TESTLM("a*", "", TRUE, TRUE);
583 REGEX_TESTLM("a*", "b", TRUE, FALSE);
584
585
586 //
587 // Patterns with "."
588 //
589 REGEX_TESTLM(".", "abc", TRUE, FALSE);
590 REGEX_TESTLM("...", "abc", TRUE, TRUE);
591 REGEX_TESTLM("....", "abc", FALSE, FALSE);
592 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
593 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
594 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
595 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
596 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
597
598 //
599 // Patterns with * applied to chars at end of literal string
600 //
601 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
602 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
603
604 //
605 // Supplemental chars match as single chars, not a pair of surrogates.
606 //
607 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
608 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
609 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
610
611
612 //
613 // UnicodeSets in the pattern
614 //
615 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
616 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
617 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
618 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
619 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
620 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
621
622 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
623 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
624 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
625 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
626 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
627
628 //
629 // OR operator in patterns
630 //
631 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
632 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
633 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
634 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
635
636 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
637 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
638 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
639 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
640 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
641 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
642
643 //
644 // +
645 //
646 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
647 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
648 REGEX_TESTLM("b+", "", FALSE, FALSE);
649 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
650 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
651 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
652
653 //
654 // ?
655 //
656 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
657 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
658 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
659 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
660 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
661 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
662 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
663 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
664 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
665
666 //
667 // Escape sequences that become single literal chars, handled internally
668 // by ICU's Unescape.
669 //
670
671 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
672 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
673 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
674 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
675 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
676 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
677 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
678 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
679 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
680 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
681
682 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
683 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
684
685 // Escape of special chars in patterns
686 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
687 }
688
689
690 //---------------------------------------------------------------------------
691 //
692 // UTextBasic Check for quirks that are specific to the UText
693 // implementation.
694 //
695 //---------------------------------------------------------------------------
696 void RegexTest::UTextBasic() {
697 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
698 UErrorCode status = U_ZERO_ERROR;
699 UText pattern = UTEXT_INITIALIZER;
700 utext_openUTF8(&pattern, str_abc, -1, &status);
701 RegexMatcher matcher(&pattern, 0, status);
702 REGEX_CHECK_STATUS;
703
704 UText input = UTEXT_INITIALIZER;
705 utext_openUTF8(&input, str_abc, -1, &status);
706 REGEX_CHECK_STATUS;
707 matcher.reset(&input);
708 REGEX_CHECK_STATUS;
709 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
710
711 matcher.reset(matcher.inputText());
712 REGEX_CHECK_STATUS;
713 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
714
715 utext_close(&pattern);
716 utext_close(&input);
717 }
718
719
720 //---------------------------------------------------------------------------
721 //
722 // API_Match Test that the API for class RegexMatcher
723 // is present and nominally working, but excluding functions
724 // implementing replace operations.
725 //
726 //---------------------------------------------------------------------------
727 void RegexTest::API_Match() {
728 UParseError pe;
729 UErrorCode status=U_ZERO_ERROR;
730 int32_t flags = 0;
731
732 //
733 // Debug - slide failing test cases early
734 //
735 #if 0
736 {
737 }
738 return;
739 #endif
740
741 //
742 // Simple pattern compilation
743 //
744 {
745 UnicodeString re("abc");
746 RegexPattern *pat2;
747 pat2 = RegexPattern::compile(re, flags, pe, status);
748 REGEX_CHECK_STATUS;
749
750 UnicodeString inStr1 = "abcdef this is a test";
751 UnicodeString instr2 = "not abc";
752 UnicodeString empty = "";
753
754
755 //
756 // Matcher creation and reset.
757 //
758 RegexMatcher *m1 = pat2->matcher(inStr1, status);
759 REGEX_CHECK_STATUS;
760 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
761 REGEX_ASSERT(m1->input() == inStr1);
762 m1->reset(instr2);
763 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
764 REGEX_ASSERT(m1->input() == instr2);
765 m1->reset(inStr1);
766 REGEX_ASSERT(m1->input() == inStr1);
767 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
768 m1->reset(empty);
769 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
770 REGEX_ASSERT(m1->input() == empty);
771 REGEX_ASSERT(&m1->pattern() == pat2);
772
773 //
774 // reset(pos, status)
775 //
776 m1->reset(inStr1);
777 m1->reset(4, status);
778 REGEX_CHECK_STATUS;
779 REGEX_ASSERT(m1->input() == inStr1);
780 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
781
782 m1->reset(-1, status);
783 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
784 status = U_ZERO_ERROR;
785
786 m1->reset(0, status);
787 REGEX_CHECK_STATUS;
788 status = U_ZERO_ERROR;
789
790 int32_t len = m1->input().length();
791 m1->reset(len-1, status);
792 REGEX_CHECK_STATUS;
793 status = U_ZERO_ERROR;
794
795 m1->reset(len, status);
796 REGEX_CHECK_STATUS;
797 status = U_ZERO_ERROR;
798
799 m1->reset(len+1, status);
800 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
801 status = U_ZERO_ERROR;
802
803 //
804 // match(pos, status)
805 //
806 m1->reset(instr2);
807 REGEX_ASSERT(m1->matches(4, status) == TRUE);
808 m1->reset();
809 REGEX_ASSERT(m1->matches(3, status) == FALSE);
810 m1->reset();
811 REGEX_ASSERT(m1->matches(5, status) == FALSE);
812 REGEX_ASSERT(m1->matches(4, status) == TRUE);
813 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
815
816 // Match() at end of string should fail, but should not
817 // be an error.
818 status = U_ZERO_ERROR;
819 len = m1->input().length();
820 REGEX_ASSERT(m1->matches(len, status) == FALSE);
821 REGEX_CHECK_STATUS;
822
823 // Match beyond end of string should fail with an error.
824 status = U_ZERO_ERROR;
825 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
826 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
827
828 // Successful match at end of string.
829 {
830 status = U_ZERO_ERROR;
831 RegexMatcher m("A?", 0, status); // will match zero length string.
832 REGEX_CHECK_STATUS;
833 m.reset(inStr1);
834 len = inStr1.length();
835 REGEX_ASSERT(m.matches(len, status) == TRUE);
836 REGEX_CHECK_STATUS;
837 m.reset(empty);
838 REGEX_ASSERT(m.matches(0, status) == TRUE);
839 REGEX_CHECK_STATUS;
840 }
841
842
843 //
844 // lookingAt(pos, status)
845 //
846 status = U_ZERO_ERROR;
847 m1->reset(instr2); // "not abc"
848 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
849 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
850 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
851 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
852 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
853 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
854 status = U_ZERO_ERROR;
855 len = m1->input().length();
856 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
857 REGEX_CHECK_STATUS;
858 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
859 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
860
861 delete m1;
862 delete pat2;
863 }
864
865
866 //
867 // Capture Group.
868 // RegexMatcher::start();
869 // RegexMatcher::end();
870 // RegexMatcher::groupCount();
871 //
872 {
873 int32_t flags=0;
874 UParseError pe;
875 UErrorCode status=U_ZERO_ERROR;
876
877 UnicodeString re("01(23(45)67)(.*)");
878 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
879 REGEX_CHECK_STATUS;
880 UnicodeString data = "0123456789";
881
882 RegexMatcher *matcher = pat->matcher(data, status);
883 REGEX_CHECK_STATUS;
884 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
885 static const int32_t matchStarts[] = {0, 2, 4, 8};
886 static const int32_t matchEnds[] = {10, 8, 6, 10};
887 int32_t i;
888 for (i=0; i<4; i++) {
889 int32_t actualStart = matcher->start(i, status);
890 REGEX_CHECK_STATUS;
891 if (actualStart != matchStarts[i]) {
892 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
893 __LINE__, i, matchStarts[i], actualStart);
894 }
895 int32_t actualEnd = matcher->end(i, status);
896 REGEX_CHECK_STATUS;
897 if (actualEnd != matchEnds[i]) {
898 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
899 __LINE__, i, matchEnds[i], actualEnd);
900 }
901 }
902
903 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
904 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
905
906 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
907 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
908 matcher->reset();
909 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
910
911 matcher->lookingAt(status);
912 REGEX_ASSERT(matcher->group(status) == "0123456789");
913 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
914 REGEX_ASSERT(matcher->group(1, status) == "234567" );
915 REGEX_ASSERT(matcher->group(2, status) == "45" );
916 REGEX_ASSERT(matcher->group(3, status) == "89" );
917 REGEX_CHECK_STATUS;
918 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
919 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
920 matcher->reset();
921 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
922
923 delete matcher;
924 delete pat;
925
926 }
927
928 //
929 // find
930 //
931 {
932 int32_t flags=0;
933 UParseError pe;
934 UErrorCode status=U_ZERO_ERROR;
935
936 UnicodeString re("abc");
937 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
938 REGEX_CHECK_STATUS;
939 UnicodeString data = ".abc..abc...abc..";
940 // 012345678901234567
941
942 RegexMatcher *matcher = pat->matcher(data, status);
943 REGEX_CHECK_STATUS;
944 REGEX_ASSERT(matcher->find());
945 REGEX_ASSERT(matcher->start(status) == 1);
946 REGEX_ASSERT(matcher->find());
947 REGEX_ASSERT(matcher->start(status) == 6);
948 REGEX_ASSERT(matcher->find());
949 REGEX_ASSERT(matcher->start(status) == 12);
950 REGEX_ASSERT(matcher->find() == FALSE);
951 REGEX_ASSERT(matcher->find() == FALSE);
952
953 matcher->reset();
954 REGEX_ASSERT(matcher->find());
955 REGEX_ASSERT(matcher->start(status) == 1);
956
957 REGEX_ASSERT(matcher->find(0, status));
958 REGEX_ASSERT(matcher->start(status) == 1);
959 REGEX_ASSERT(matcher->find(1, status));
960 REGEX_ASSERT(matcher->start(status) == 1);
961 REGEX_ASSERT(matcher->find(2, status));
962 REGEX_ASSERT(matcher->start(status) == 6);
963 REGEX_ASSERT(matcher->find(12, status));
964 REGEX_ASSERT(matcher->start(status) == 12);
965 REGEX_ASSERT(matcher->find(13, status) == FALSE);
966 REGEX_ASSERT(matcher->find(16, status) == FALSE);
967 REGEX_ASSERT(matcher->find(17, status) == FALSE);
968 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
969
970 status = U_ZERO_ERROR;
971 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
972 status = U_ZERO_ERROR;
973 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
974
975 REGEX_ASSERT(matcher->groupCount() == 0);
976
977 delete matcher;
978 delete pat;
979 }
980
981
982 //
983 // find, with \G in pattern (true if at the end of a previous match).
984 //
985 {
986 int32_t flags=0;
987 UParseError pe;
988 UErrorCode status=U_ZERO_ERROR;
989
990 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
991 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
992 REGEX_CHECK_STATUS;
993 UnicodeString data = ".abcabc.abc..";
994 // 012345678901234567
995
996 RegexMatcher *matcher = pat->matcher(data, status);
997 REGEX_CHECK_STATUS;
998 REGEX_ASSERT(matcher->find());
999 REGEX_ASSERT(matcher->start(status) == 0);
1000 REGEX_ASSERT(matcher->start(1, status) == -1);
1001 REGEX_ASSERT(matcher->start(2, status) == 1);
1002
1003 REGEX_ASSERT(matcher->find());
1004 REGEX_ASSERT(matcher->start(status) == 4);
1005 REGEX_ASSERT(matcher->start(1, status) == 4);
1006 REGEX_ASSERT(matcher->start(2, status) == -1);
1007 REGEX_CHECK_STATUS;
1008
1009 delete matcher;
1010 delete pat;
1011 }
1012
1013 //
1014 // find with zero length matches, match position should bump ahead
1015 // to prevent loops.
1016 //
1017 {
1018 int32_t i;
1019 UErrorCode status=U_ZERO_ERROR;
1020 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1021 // using an always-true look-ahead.
1022 REGEX_CHECK_STATUS;
1023 UnicodeString s(" ");
1024 m.reset(s);
1025 for (i=0; ; i++) {
1026 if (m.find() == FALSE) {
1027 break;
1028 }
1029 REGEX_ASSERT(m.start(status) == i);
1030 REGEX_ASSERT(m.end(status) == i);
1031 }
1032 REGEX_ASSERT(i==5);
1033
1034 // Check that the bump goes over surrogate pairs OK
1035 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1036 s = s.unescape();
1037 m.reset(s);
1038 for (i=0; ; i+=2) {
1039 if (m.find() == FALSE) {
1040 break;
1041 }
1042 REGEX_ASSERT(m.start(status) == i);
1043 REGEX_ASSERT(m.end(status) == i);
1044 }
1045 REGEX_ASSERT(i==10);
1046 }
1047 {
1048 // find() loop breaking test.
1049 // with pattern of /.?/, should see a series of one char matches, then a single
1050 // match of zero length at the end of the input string.
1051 int32_t i;
1052 UErrorCode status=U_ZERO_ERROR;
1053 RegexMatcher m(".?", 0, status);
1054 REGEX_CHECK_STATUS;
1055 UnicodeString s(" ");
1056 m.reset(s);
1057 for (i=0; ; i++) {
1058 if (m.find() == FALSE) {
1059 break;
1060 }
1061 REGEX_ASSERT(m.start(status) == i);
1062 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1063 }
1064 REGEX_ASSERT(i==5);
1065 }
1066
1067
1068 //
1069 // Matchers with no input string behave as if they had an empty input string.
1070 //
1071
1072 {
1073 UErrorCode status = U_ZERO_ERROR;
1074 RegexMatcher m(".?", 0, status);
1075 REGEX_CHECK_STATUS;
1076 REGEX_ASSERT(m.find());
1077 REGEX_ASSERT(m.start(status) == 0);
1078 REGEX_ASSERT(m.input() == "");
1079 }
1080 {
1081 UErrorCode status = U_ZERO_ERROR;
1082 RegexPattern *p = RegexPattern::compile(".", 0, status);
1083 RegexMatcher *m = p->matcher(status);
1084 REGEX_CHECK_STATUS;
1085
1086 REGEX_ASSERT(m->find() == FALSE);
1087 REGEX_ASSERT(m->input() == "");
1088 delete m;
1089 delete p;
1090 }
1091
1092 //
1093 // Regions
1094 //
1095 {
1096 UErrorCode status = U_ZERO_ERROR;
1097 UnicodeString testString("This is test data");
1098 RegexMatcher m(".*", testString, 0, status);
1099 REGEX_CHECK_STATUS;
1100 REGEX_ASSERT(m.regionStart() == 0);
1101 REGEX_ASSERT(m.regionEnd() == testString.length());
1102 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1103 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1104
1105 m.region(2,4, status);
1106 REGEX_CHECK_STATUS;
1107 REGEX_ASSERT(m.matches(status));
1108 REGEX_ASSERT(m.start(status)==2);
1109 REGEX_ASSERT(m.end(status)==4);
1110 REGEX_CHECK_STATUS;
1111
1112 m.reset();
1113 REGEX_ASSERT(m.regionStart() == 0);
1114 REGEX_ASSERT(m.regionEnd() == testString.length());
1115
1116 UnicodeString shorterString("short");
1117 m.reset(shorterString);
1118 REGEX_ASSERT(m.regionStart() == 0);
1119 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1120
1121 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1122 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1123 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1124 REGEX_ASSERT(&m == &m.reset());
1125 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1126
1127 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1128 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1129 REGEX_ASSERT(&m == &m.reset());
1130 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1131
1132 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1133 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1134 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1135 REGEX_ASSERT(&m == &m.reset());
1136 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1137
1138 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1139 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1140 REGEX_ASSERT(&m == &m.reset());
1141 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1142
1143 }
1144
1145 //
1146 // hitEnd() and requireEnd()
1147 //
1148 {
1149 UErrorCode status = U_ZERO_ERROR;
1150 UnicodeString testString("aabb");
1151 RegexMatcher m1(".*", testString, 0, status);
1152 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1153 REGEX_ASSERT(m1.hitEnd() == TRUE);
1154 REGEX_ASSERT(m1.requireEnd() == FALSE);
1155 REGEX_CHECK_STATUS;
1156
1157 status = U_ZERO_ERROR;
1158 RegexMatcher m2("a*", testString, 0, status);
1159 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1160 REGEX_ASSERT(m2.hitEnd() == FALSE);
1161 REGEX_ASSERT(m2.requireEnd() == FALSE);
1162 REGEX_CHECK_STATUS;
1163
1164 status = U_ZERO_ERROR;
1165 RegexMatcher m3(".*$", testString, 0, status);
1166 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1167 REGEX_ASSERT(m3.hitEnd() == TRUE);
1168 REGEX_ASSERT(m3.requireEnd() == TRUE);
1169 REGEX_CHECK_STATUS;
1170 }
1171
1172
1173 //
1174 // Compilation error on reset with UChar *
1175 // These were a hazard that people were stumbling over with runtime errors.
1176 // Changed them to compiler errors by adding private methods that more closely
1177 // matched the incorrect use of the functions.
1178 //
1179 #if 0
1180 {
1181 UErrorCode status = U_ZERO_ERROR;
1182 UChar ucharString[20];
1183 RegexMatcher m(".", 0, status);
1184 m.reset(ucharString); // should not compile.
1185
1186 RegexPattern *p = RegexPattern::compile(".", 0, status);
1187 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1188
1189 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1190 }
1191 #endif
1192
1193 //
1194 // Time Outs.
1195 // Note: These tests will need to be changed when the regexp engine is
1196 // able to detect and cut short the exponential time behavior on
1197 // this type of match.
1198 //
1199 {
1200 UErrorCode status = U_ZERO_ERROR;
1201 // Enough 'a's in the string to cause the match to time out.
1202 // (Each additional 'a' doubles the time)
1203 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1204 RegexMatcher matcher("(a+)+b", testString, 0, status);
1205 REGEX_CHECK_STATUS;
1206 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1207 matcher.setTimeLimit(100, status);
1208 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1209 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1210 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1211 }
1212 {
1213 UErrorCode status = U_ZERO_ERROR;
1214 // Few enough 'a's to slip in under the time limit.
1215 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1216 RegexMatcher matcher("(a+)+b", testString, 0, status);
1217 REGEX_CHECK_STATUS;
1218 matcher.setTimeLimit(100, status);
1219 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1220 REGEX_CHECK_STATUS;
1221 }
1222
1223 //
1224 // Stack Limits
1225 //
1226 {
1227 UErrorCode status = U_ZERO_ERROR;
1228 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1229
1230 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1231 // of the '+', and makes the stack frames larger.
1232 RegexMatcher matcher("(A)+A$", testString, 0, status);
1233
1234 // With the default stack, this match should fail to run
1235 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1236 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1237
1238 // With unlimited stack, it should run
1239 status = U_ZERO_ERROR;
1240 matcher.setStackLimit(0, status);
1241 REGEX_CHECK_STATUS;
1242 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1243 REGEX_CHECK_STATUS;
1244 REGEX_ASSERT(matcher.getStackLimit() == 0);
1245
1246 // With a limited stack, the match should fail
1247 status = U_ZERO_ERROR;
1248 matcher.setStackLimit(10000, status);
1249 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1250 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1251 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1252 }
1253
1254 // A pattern that doesn't save state should work with
1255 // a minimal sized stack
1256 {
1257 UErrorCode status = U_ZERO_ERROR;
1258 UnicodeString testString = "abc";
1259 RegexMatcher matcher("abc", testString, 0, status);
1260 REGEX_CHECK_STATUS;
1261 matcher.setStackLimit(30, status);
1262 REGEX_CHECK_STATUS;
1263 REGEX_ASSERT(matcher.matches(status) == TRUE);
1264 REGEX_CHECK_STATUS;
1265 REGEX_ASSERT(matcher.getStackLimit() == 30);
1266
1267 // Negative stack sizes should fail
1268 status = U_ZERO_ERROR;
1269 matcher.setStackLimit(1000, status);
1270 REGEX_CHECK_STATUS;
1271 matcher.setStackLimit(-1, status);
1272 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1273 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1274 }
1275
1276
1277 }
1278
1279
1280
1281
1282
1283
1284 //---------------------------------------------------------------------------
1285 //
1286 // API_Replace API test for class RegexMatcher, testing the
1287 // Replace family of functions.
1288 //
1289 //---------------------------------------------------------------------------
1290 void RegexTest::API_Replace() {
1291 //
1292 // Replace
1293 //
1294 int32_t flags=0;
1295 UParseError pe;
1296 UErrorCode status=U_ZERO_ERROR;
1297
1298 UnicodeString re("abc");
1299 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1300 REGEX_CHECK_STATUS;
1301 UnicodeString data = ".abc..abc...abc..";
1302 // 012345678901234567
1303 RegexMatcher *matcher = pat->matcher(data, status);
1304
1305 //
1306 // Plain vanilla matches.
1307 //
1308 UnicodeString dest;
1309 dest = matcher->replaceFirst("yz", status);
1310 REGEX_CHECK_STATUS;
1311 REGEX_ASSERT(dest == ".yz..abc...abc..");
1312
1313 dest = matcher->replaceAll("yz", status);
1314 REGEX_CHECK_STATUS;
1315 REGEX_ASSERT(dest == ".yz..yz...yz..");
1316
1317 //
1318 // Plain vanilla non-matches.
1319 //
1320 UnicodeString d2 = ".abx..abx...abx..";
1321 matcher->reset(d2);
1322 dest = matcher->replaceFirst("yz", status);
1323 REGEX_CHECK_STATUS;
1324 REGEX_ASSERT(dest == ".abx..abx...abx..");
1325
1326 dest = matcher->replaceAll("yz", status);
1327 REGEX_CHECK_STATUS;
1328 REGEX_ASSERT(dest == ".abx..abx...abx..");
1329
1330 //
1331 // Empty source string
1332 //
1333 UnicodeString d3 = "";
1334 matcher->reset(d3);
1335 dest = matcher->replaceFirst("yz", status);
1336 REGEX_CHECK_STATUS;
1337 REGEX_ASSERT(dest == "");
1338
1339 dest = matcher->replaceAll("yz", status);
1340 REGEX_CHECK_STATUS;
1341 REGEX_ASSERT(dest == "");
1342
1343 //
1344 // Empty substitution string
1345 //
1346 matcher->reset(data); // ".abc..abc...abc.."
1347 dest = matcher->replaceFirst("", status);
1348 REGEX_CHECK_STATUS;
1349 REGEX_ASSERT(dest == "...abc...abc..");
1350
1351 dest = matcher->replaceAll("", status);
1352 REGEX_CHECK_STATUS;
1353 REGEX_ASSERT(dest == "........");
1354
1355 //
1356 // match whole string
1357 //
1358 UnicodeString d4 = "abc";
1359 matcher->reset(d4);
1360 dest = matcher->replaceFirst("xyz", status);
1361 REGEX_CHECK_STATUS;
1362 REGEX_ASSERT(dest == "xyz");
1363
1364 dest = matcher->replaceAll("xyz", status);
1365 REGEX_CHECK_STATUS;
1366 REGEX_ASSERT(dest == "xyz");
1367
1368 //
1369 // Capture Group, simple case
1370 //
1371 UnicodeString re2("a(..)");
1372 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1373 REGEX_CHECK_STATUS;
1374 UnicodeString d5 = "abcdefg";
1375 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1376 REGEX_CHECK_STATUS;
1377 dest = matcher2->replaceFirst("$1$1", status);
1378 REGEX_CHECK_STATUS;
1379 REGEX_ASSERT(dest == "bcbcdefg");
1380
1381 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1382 REGEX_CHECK_STATUS;
1383 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1384
1385 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1386 REGEX_ASSERT(U_FAILURE(status));
1387 status = U_ZERO_ERROR;
1388
1389 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1390 replacement = replacement.unescape();
1391 dest = matcher2->replaceFirst(replacement, status);
1392 REGEX_CHECK_STATUS;
1393 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1394
1395 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1396
1397
1398 //
1399 // Replacement String with \u hex escapes
1400 //
1401 {
1402 UnicodeString src = "abc 1 abc 2 abc 3";
1403 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1404 matcher->reset(src);
1405 UnicodeString result = matcher->replaceAll(substitute, status);
1406 REGEX_CHECK_STATUS;
1407 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1408 }
1409 {
1410 UnicodeString src = "abc !";
1411 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1412 matcher->reset(src);
1413 UnicodeString result = matcher->replaceAll(substitute, status);
1414 REGEX_CHECK_STATUS;
1415 UnicodeString expected = UnicodeString("--");
1416 expected.append((UChar32)0x10000);
1417 expected.append("-- !");
1418 REGEX_ASSERT(result == expected);
1419 }
1420 // TODO: need more through testing of capture substitutions.
1421
1422 // Bug 4057
1423 //
1424 {
1425 status = U_ZERO_ERROR;
1426 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1427 RegexMatcher m("ss(.*?)ee", 0, status);
1428 REGEX_CHECK_STATUS;
1429 UnicodeString result;
1430
1431 // Multiple finds do NOT bump up the previous appendReplacement postion.
1432 m.reset(s);
1433 m.find();
1434 m.find();
1435 m.appendReplacement(result, "ooh", status);
1436 REGEX_CHECK_STATUS;
1437 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1438
1439 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1440 status = U_ZERO_ERROR;
1441 result.truncate(0);
1442 m.reset(10, status);
1443 m.find();
1444 m.find();
1445 m.appendReplacement(result, "ooh", status);
1446 REGEX_CHECK_STATUS;
1447 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1448
1449 // find() at interior of string, appendReplacemnt still starts at beginning.
1450 status = U_ZERO_ERROR;
1451 result.truncate(0);
1452 m.reset();
1453 m.find(10, status);
1454 m.find();
1455 m.appendReplacement(result, "ooh", status);
1456 REGEX_CHECK_STATUS;
1457 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1458
1459 m.appendTail(result);
1460 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1461
1462 }
1463
1464 delete matcher2;
1465 delete pat2;
1466 delete matcher;
1467 delete pat;
1468 }
1469
1470
1471 //---------------------------------------------------------------------------
1472 //
1473 // API_Pattern Test that the API for class RegexPattern is
1474 // present and nominally working.
1475 //
1476 //---------------------------------------------------------------------------
1477 void RegexTest::API_Pattern() {
1478 RegexPattern pata; // Test default constructor to not crash.
1479 RegexPattern patb;
1480
1481 REGEX_ASSERT(pata == patb);
1482 REGEX_ASSERT(pata == pata);
1483
1484 UnicodeString re1("abc[a-l][m-z]");
1485 UnicodeString re2("def");
1486 UErrorCode status = U_ZERO_ERROR;
1487 UParseError pe;
1488
1489 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1490 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1491 REGEX_CHECK_STATUS;
1492 REGEX_ASSERT(*pat1 == *pat1);
1493 REGEX_ASSERT(*pat1 != pata);
1494
1495 // Assign
1496 patb = *pat1;
1497 REGEX_ASSERT(patb == *pat1);
1498
1499 // Copy Construct
1500 RegexPattern patc(*pat1);
1501 REGEX_ASSERT(patc == *pat1);
1502 REGEX_ASSERT(patb == patc);
1503 REGEX_ASSERT(pat1 != pat2);
1504 patb = *pat2;
1505 REGEX_ASSERT(patb != patc);
1506 REGEX_ASSERT(patb == *pat2);
1507
1508 // Compile with no flags.
1509 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1510 REGEX_ASSERT(*pat1a == *pat1);
1511
1512 REGEX_ASSERT(pat1a->flags() == 0);
1513
1514 // Compile with different flags should be not equal
1515 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1516 REGEX_CHECK_STATUS;
1517
1518 REGEX_ASSERT(*pat1b != *pat1a);
1519 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1520 REGEX_ASSERT(pat1a->flags() == 0);
1521 delete pat1b;
1522
1523 // clone
1524 RegexPattern *pat1c = pat1->clone();
1525 REGEX_ASSERT(*pat1c == *pat1);
1526 REGEX_ASSERT(*pat1c != *pat2);
1527
1528 delete pat1c;
1529 delete pat1a;
1530 delete pat1;
1531 delete pat2;
1532
1533
1534 //
1535 // Verify that a matcher created from a cloned pattern works.
1536 // (Jitterbug 3423)
1537 //
1538 {
1539 UErrorCode status = U_ZERO_ERROR;
1540 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1541 RegexPattern *pClone = pSource->clone();
1542 delete pSource;
1543 RegexMatcher *mFromClone = pClone->matcher(status);
1544 REGEX_CHECK_STATUS;
1545 UnicodeString s = "Hello World";
1546 mFromClone->reset(s);
1547 REGEX_ASSERT(mFromClone->find() == TRUE);
1548 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1549 REGEX_ASSERT(mFromClone->find() == TRUE);
1550 REGEX_ASSERT(mFromClone->group(status) == "World");
1551 REGEX_ASSERT(mFromClone->find() == FALSE);
1552 delete mFromClone;
1553 delete pClone;
1554 }
1555
1556 //
1557 // matches convenience API
1558 //
1559 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1560 REGEX_CHECK_STATUS;
1561 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1562 REGEX_CHECK_STATUS;
1563 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1564 REGEX_CHECK_STATUS;
1565 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1566 REGEX_CHECK_STATUS;
1567 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1568 REGEX_CHECK_STATUS;
1569 status = U_INDEX_OUTOFBOUNDS_ERROR;
1570 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1571 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1572
1573
1574 //
1575 // Split()
1576 //
1577 status = U_ZERO_ERROR;
1578 pat1 = RegexPattern::compile(" +", pe, status);
1579 REGEX_CHECK_STATUS;
1580 UnicodeString fields[10];
1581
1582 int32_t n;
1583 n = pat1->split("Now is the time", fields, 10, status);
1584 REGEX_CHECK_STATUS;
1585 REGEX_ASSERT(n==4);
1586 REGEX_ASSERT(fields[0]=="Now");
1587 REGEX_ASSERT(fields[1]=="is");
1588 REGEX_ASSERT(fields[2]=="the");
1589 REGEX_ASSERT(fields[3]=="time");
1590 REGEX_ASSERT(fields[4]=="");
1591
1592 n = pat1->split("Now is the time", fields, 2, status);
1593 REGEX_CHECK_STATUS;
1594 REGEX_ASSERT(n==2);
1595 REGEX_ASSERT(fields[0]=="Now");
1596 REGEX_ASSERT(fields[1]=="is the time");
1597 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1598
1599 fields[1] = "*";
1600 status = U_ZERO_ERROR;
1601 n = pat1->split("Now is the time", fields, 1, status);
1602 REGEX_CHECK_STATUS;
1603 REGEX_ASSERT(n==1);
1604 REGEX_ASSERT(fields[0]=="Now is the time");
1605 REGEX_ASSERT(fields[1]=="*");
1606 status = U_ZERO_ERROR;
1607
1608 n = pat1->split(" Now is the time ", fields, 10, status);
1609 REGEX_CHECK_STATUS;
1610 REGEX_ASSERT(n==6);
1611 REGEX_ASSERT(fields[0]=="");
1612 REGEX_ASSERT(fields[1]=="Now");
1613 REGEX_ASSERT(fields[2]=="is");
1614 REGEX_ASSERT(fields[3]=="the");
1615 REGEX_ASSERT(fields[4]=="time");
1616 REGEX_ASSERT(fields[5]=="");
1617
1618 n = pat1->split(" ", fields, 10, status);
1619 REGEX_CHECK_STATUS;
1620 REGEX_ASSERT(n==2);
1621 REGEX_ASSERT(fields[0]=="");
1622 REGEX_ASSERT(fields[1]=="");
1623
1624 fields[0] = "foo";
1625 n = pat1->split("", fields, 10, status);
1626 REGEX_CHECK_STATUS;
1627 REGEX_ASSERT(n==0);
1628 REGEX_ASSERT(fields[0]=="foo");
1629
1630 delete pat1;
1631
1632 // split, with a pattern with (capture)
1633 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1634 REGEX_CHECK_STATUS;
1635
1636 status = U_ZERO_ERROR;
1637 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1638 REGEX_CHECK_STATUS;
1639 REGEX_ASSERT(n==7);
1640 REGEX_ASSERT(fields[0]=="");
1641 REGEX_ASSERT(fields[1]=="a");
1642 REGEX_ASSERT(fields[2]=="Now is ");
1643 REGEX_ASSERT(fields[3]=="b");
1644 REGEX_ASSERT(fields[4]=="the time");
1645 REGEX_ASSERT(fields[5]=="c");
1646 REGEX_ASSERT(fields[6]=="");
1647 REGEX_ASSERT(status==U_ZERO_ERROR);
1648
1649 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1650 REGEX_CHECK_STATUS;
1651 REGEX_ASSERT(n==7);
1652 REGEX_ASSERT(fields[0]==" ");
1653 REGEX_ASSERT(fields[1]=="a");
1654 REGEX_ASSERT(fields[2]=="Now is ");
1655 REGEX_ASSERT(fields[3]=="b");
1656 REGEX_ASSERT(fields[4]=="the time");
1657 REGEX_ASSERT(fields[5]=="c");
1658 REGEX_ASSERT(fields[6]=="");
1659
1660 status = U_ZERO_ERROR;
1661 fields[6] = "foo";
1662 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1663 REGEX_CHECK_STATUS;
1664 REGEX_ASSERT(n==6);
1665 REGEX_ASSERT(fields[0]==" ");
1666 REGEX_ASSERT(fields[1]=="a");
1667 REGEX_ASSERT(fields[2]=="Now is ");
1668 REGEX_ASSERT(fields[3]=="b");
1669 REGEX_ASSERT(fields[4]=="the time");
1670 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1671 REGEX_ASSERT(fields[6]=="foo");
1672
1673 status = U_ZERO_ERROR;
1674 fields[5] = "foo";
1675 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1676 REGEX_CHECK_STATUS;
1677 REGEX_ASSERT(n==5);
1678 REGEX_ASSERT(fields[0]==" ");
1679 REGEX_ASSERT(fields[1]=="a");
1680 REGEX_ASSERT(fields[2]=="Now is ");
1681 REGEX_ASSERT(fields[3]=="b");
1682 REGEX_ASSERT(fields[4]=="the time<c>");
1683 REGEX_ASSERT(fields[5]=="foo");
1684
1685 status = U_ZERO_ERROR;
1686 fields[5] = "foo";
1687 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1688 REGEX_CHECK_STATUS;
1689 REGEX_ASSERT(n==5);
1690 REGEX_ASSERT(fields[0]==" ");
1691 REGEX_ASSERT(fields[1]=="a");
1692 REGEX_ASSERT(fields[2]=="Now is ");
1693 REGEX_ASSERT(fields[3]=="b");
1694 REGEX_ASSERT(fields[4]=="the time");
1695 REGEX_ASSERT(fields[5]=="foo");
1696
1697 status = U_ZERO_ERROR;
1698 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1699 REGEX_CHECK_STATUS;
1700 REGEX_ASSERT(n==4);
1701 REGEX_ASSERT(fields[0]==" ");
1702 REGEX_ASSERT(fields[1]=="a");
1703 REGEX_ASSERT(fields[2]=="Now is ");
1704 REGEX_ASSERT(fields[3]=="the time<c>");
1705 status = U_ZERO_ERROR;
1706 delete pat1;
1707
1708 pat1 = RegexPattern::compile("([-,])", pe, status);
1709 REGEX_CHECK_STATUS;
1710 n = pat1->split("1-10,20", fields, 10, status);
1711 REGEX_CHECK_STATUS;
1712 REGEX_ASSERT(n==5);
1713 REGEX_ASSERT(fields[0]=="1");
1714 REGEX_ASSERT(fields[1]=="-");
1715 REGEX_ASSERT(fields[2]=="10");
1716 REGEX_ASSERT(fields[3]==",");
1717 REGEX_ASSERT(fields[4]=="20");
1718 delete pat1;
1719
1720 // Test split of string with empty trailing fields
1721 pat1 = RegexPattern::compile(",", pe, status);
1722 REGEX_CHECK_STATUS;
1723 n = pat1->split("a,b,c,", fields, 10, status);
1724 REGEX_CHECK_STATUS;
1725 REGEX_ASSERT(n==4);
1726 REGEX_ASSERT(fields[0]=="a");
1727 REGEX_ASSERT(fields[1]=="b");
1728 REGEX_ASSERT(fields[2]=="c");
1729 REGEX_ASSERT(fields[3]=="");
1730
1731 n = pat1->split("a,,,", fields, 10, status);
1732 REGEX_CHECK_STATUS;
1733 REGEX_ASSERT(n==4);
1734 REGEX_ASSERT(fields[0]=="a");
1735 REGEX_ASSERT(fields[1]=="");
1736 REGEX_ASSERT(fields[2]=="");
1737 REGEX_ASSERT(fields[3]=="");
1738 delete pat1;
1739
1740 // Split Separator with zero length match.
1741 pat1 = RegexPattern::compile(":?", pe, status);
1742 REGEX_CHECK_STATUS;
1743 n = pat1->split("abc", fields, 10, status);
1744 REGEX_CHECK_STATUS;
1745 REGEX_ASSERT(n==5);
1746 REGEX_ASSERT(fields[0]=="");
1747 REGEX_ASSERT(fields[1]=="a");
1748 REGEX_ASSERT(fields[2]=="b");
1749 REGEX_ASSERT(fields[3]=="c");
1750 REGEX_ASSERT(fields[4]=="");
1751
1752 delete pat1;
1753
1754 //
1755 // RegexPattern::pattern()
1756 //
1757 pat1 = new RegexPattern();
1758 REGEX_ASSERT(pat1->pattern() == "");
1759 delete pat1;
1760
1761 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1762 REGEX_CHECK_STATUS;
1763 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1764 delete pat1;
1765
1766
1767 //
1768 // classID functions
1769 //
1770 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1771 REGEX_CHECK_STATUS;
1772 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1773 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1774 UnicodeString Hello("Hello, world.");
1775 RegexMatcher *m = pat1->matcher(Hello, status);
1776 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1777 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1778 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1779 delete m;
1780 delete pat1;
1781
1782 }
1783
1784 //---------------------------------------------------------------------------
1785 //
1786 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1787 // is present and working, but excluding functions
1788 // implementing replace operations.
1789 //
1790 //---------------------------------------------------------------------------
1791 void RegexTest::API_Match_UTF8() {
1792 UParseError pe;
1793 UErrorCode status=U_ZERO_ERROR;
1794 int32_t flags = 0;
1795
1796 //
1797 // Debug - slide failing test cases early
1798 //
1799 #if 0
1800 {
1801 }
1802 return;
1803 #endif
1804
1805 //
1806 // Simple pattern compilation
1807 //
1808 {
1809 UText re = UTEXT_INITIALIZER;
1810 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1811 REGEX_VERBOSE_TEXT(&re);
1812 RegexPattern *pat2;
1813 pat2 = RegexPattern::compile(&re, flags, pe, status);
1814 REGEX_CHECK_STATUS;
1815
1816 UText input1 = UTEXT_INITIALIZER;
1817 UText input2 = UTEXT_INITIALIZER;
1818 UText empty = UTEXT_INITIALIZER;
1819 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1820 REGEX_VERBOSE_TEXT(&input1);
1821 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1822 REGEX_VERBOSE_TEXT(&input2);
1823 utext_openUChars(&empty, NULL, 0, &status);
1824
1825 int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1826 int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1827
1828
1829 //
1830 // Matcher creation and reset.
1831 //
1832 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1833 REGEX_CHECK_STATUS;
1834 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1835 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1836 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1837 m1->reset(&input2);
1838 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1839 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1840 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1841 m1->reset(&input1);
1842 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1843 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1844 m1->reset(&empty);
1845 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1846 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1847
1848 //
1849 // reset(pos, status)
1850 //
1851 m1->reset(&input1);
1852 m1->reset(4, status);
1853 REGEX_CHECK_STATUS;
1854 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1855 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1856
1857 m1->reset(-1, status);
1858 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1859 status = U_ZERO_ERROR;
1860
1861 m1->reset(0, status);
1862 REGEX_CHECK_STATUS;
1863 status = U_ZERO_ERROR;
1864
1865 m1->reset(input1Len-1, status);
1866 REGEX_CHECK_STATUS;
1867 status = U_ZERO_ERROR;
1868
1869 m1->reset(input1Len, status);
1870 REGEX_CHECK_STATUS;
1871 status = U_ZERO_ERROR;
1872
1873 m1->reset(input1Len+1, status);
1874 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1875 status = U_ZERO_ERROR;
1876
1877 //
1878 // match(pos, status)
1879 //
1880 m1->reset(&input2);
1881 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1882 m1->reset();
1883 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1884 m1->reset();
1885 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1886 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1887 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1888 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1889
1890 // Match() at end of string should fail, but should not
1891 // be an error.
1892 status = U_ZERO_ERROR;
1893 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1894 REGEX_CHECK_STATUS;
1895
1896 // Match beyond end of string should fail with an error.
1897 status = U_ZERO_ERROR;
1898 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1899 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1900
1901 // Successful match at end of string.
1902 {
1903 status = U_ZERO_ERROR;
1904 RegexMatcher m("A?", 0, status); // will match zero length string.
1905 REGEX_CHECK_STATUS;
1906 m.reset(&input1);
1907 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1908 REGEX_CHECK_STATUS;
1909 m.reset(&empty);
1910 REGEX_ASSERT(m.matches(0, status) == TRUE);
1911 REGEX_CHECK_STATUS;
1912 }
1913
1914
1915 //
1916 // lookingAt(pos, status)
1917 //
1918 status = U_ZERO_ERROR;
1919 m1->reset(&input2); // "not abc"
1920 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1921 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1922 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1923 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1924 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1925 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1926 status = U_ZERO_ERROR;
1927 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1928 REGEX_CHECK_STATUS;
1929 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1930 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931
1932 delete m1;
1933 delete pat2;
1934
1935 utext_close(&re);
1936 utext_close(&input1);
1937 utext_close(&input2);
1938 utext_close(&empty);
1939 }
1940
1941
1942 //
1943 // Capture Group.
1944 // RegexMatcher::start();
1945 // RegexMatcher::end();
1946 // RegexMatcher::groupCount();
1947 //
1948 {
1949 int32_t flags=0;
1950 UParseError pe;
1951 UErrorCode status=U_ZERO_ERROR;
1952 UText re=UTEXT_INITIALIZER;
1953 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1954 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1955
1956 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1957 REGEX_CHECK_STATUS;
1958
1959 UText input = UTEXT_INITIALIZER;
1960 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1961 utext_openUTF8(&input, str_0123456789, -1, &status);
1962
1963 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1964 REGEX_CHECK_STATUS;
1965 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1966 static const int32_t matchStarts[] = {0, 2, 4, 8};
1967 static const int32_t matchEnds[] = {10, 8, 6, 10};
1968 int32_t i;
1969 for (i=0; i<4; i++) {
1970 int32_t actualStart = matcher->start(i, status);
1971 REGEX_CHECK_STATUS;
1972 if (actualStart != matchStarts[i]) {
1973 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1974 __FILE__, __LINE__, i, matchStarts[i], actualStart);
1975 }
1976 int32_t actualEnd = matcher->end(i, status);
1977 REGEX_CHECK_STATUS;
1978 if (actualEnd != matchEnds[i]) {
1979 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1980 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1981 }
1982 }
1983
1984 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1985 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1986
1987 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1988 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1989 matcher->reset();
1990 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1991
1992 matcher->lookingAt(status);
1993
1994 UnicodeString dest;
1995 UText destText = UTEXT_INITIALIZER;
1996 utext_openUnicodeString(&destText, &dest, &status);
1997 UText *result;
1998 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1999 // Test shallow-clone API
2000 int64_t group_len;
2001 result = matcher->group((UText *)NULL, group_len, status);
2002 REGEX_CHECK_STATUS;
2003 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2004 utext_close(result);
2005 result = matcher->group(0, &destText, group_len, status);
2006 REGEX_CHECK_STATUS;
2007 REGEX_ASSERT(result == &destText);
2008 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2009 // destText is now immutable, reopen it
2010 utext_close(&destText);
2011 utext_openUnicodeString(&destText, &dest, &status);
2012
2013 int64_t length;
2014 result = matcher->group(0, NULL, length, status);
2015 REGEX_CHECK_STATUS;
2016 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2017 utext_close(result);
2018 result = matcher->group(0, &destText, length, status);
2019 REGEX_CHECK_STATUS;
2020 REGEX_ASSERT(result == &destText);
2021 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2022 REGEX_ASSERT(length == 10);
2023 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2024
2025 // Capture Group 1 == "234567"
2026 result = matcher->group(1, NULL, length, status);
2027 REGEX_CHECK_STATUS;
2028 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2029 REGEX_ASSERT(length == 6);
2030 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2031 utext_close(result);
2032
2033 result = matcher->group(1, &destText, length, status);
2034 REGEX_CHECK_STATUS;
2035 REGEX_ASSERT(result == &destText);
2036 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2037 REGEX_ASSERT(length == 6);
2038 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2039 utext_close(result);
2040
2041 // Capture Group 2 == "45"
2042 result = matcher->group(2, NULL, length, status);
2043 REGEX_CHECK_STATUS;
2044 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2045 REGEX_ASSERT(length == 2);
2046 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2047 utext_close(result);
2048
2049 result = matcher->group(2, &destText, length, status);
2050 REGEX_CHECK_STATUS;
2051 REGEX_ASSERT(result == &destText);
2052 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2053 REGEX_ASSERT(length == 2);
2054 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2055 utext_close(result);
2056
2057 // Capture Group 3 == "89"
2058 result = matcher->group(3, NULL, length, status);
2059 REGEX_CHECK_STATUS;
2060 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2061 REGEX_ASSERT(length == 2);
2062 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2063 utext_close(result);
2064
2065 result = matcher->group(3, &destText, length, status);
2066 REGEX_CHECK_STATUS;
2067 REGEX_ASSERT(result == &destText);
2068 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2069 REGEX_ASSERT(length == 2);
2070 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2071 utext_close(result);
2072
2073 // Capture Group number out of range.
2074 status = U_ZERO_ERROR;
2075 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2076 status = U_ZERO_ERROR;
2077 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2078 status = U_ZERO_ERROR;
2079 matcher->reset();
2080 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2081
2082 delete matcher;
2083 delete pat;
2084
2085 utext_close(&destText);
2086 utext_close(&input);
2087 utext_close(&re);
2088 }
2089
2090 //
2091 // find
2092 //
2093 {
2094 int32_t flags=0;
2095 UParseError pe;
2096 UErrorCode status=U_ZERO_ERROR;
2097 UText re=UTEXT_INITIALIZER;
2098 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2099 utext_openUTF8(&re, str_abc, -1, &status);
2100
2101 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2102 REGEX_CHECK_STATUS;
2103 UText input = UTEXT_INITIALIZER;
2104 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2105 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2106 // 012345678901234567
2107
2108 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2109 REGEX_CHECK_STATUS;
2110 REGEX_ASSERT(matcher->find());
2111 REGEX_ASSERT(matcher->start(status) == 1);
2112 REGEX_ASSERT(matcher->find());
2113 REGEX_ASSERT(matcher->start(status) == 6);
2114 REGEX_ASSERT(matcher->find());
2115 REGEX_ASSERT(matcher->start(status) == 12);
2116 REGEX_ASSERT(matcher->find() == FALSE);
2117 REGEX_ASSERT(matcher->find() == FALSE);
2118
2119 matcher->reset();
2120 REGEX_ASSERT(matcher->find());
2121 REGEX_ASSERT(matcher->start(status) == 1);
2122
2123 REGEX_ASSERT(matcher->find(0, status));
2124 REGEX_ASSERT(matcher->start(status) == 1);
2125 REGEX_ASSERT(matcher->find(1, status));
2126 REGEX_ASSERT(matcher->start(status) == 1);
2127 REGEX_ASSERT(matcher->find(2, status));
2128 REGEX_ASSERT(matcher->start(status) == 6);
2129 REGEX_ASSERT(matcher->find(12, status));
2130 REGEX_ASSERT(matcher->start(status) == 12);
2131 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2132 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2133 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2134 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2135
2136 status = U_ZERO_ERROR;
2137 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138 status = U_ZERO_ERROR;
2139 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2140
2141 REGEX_ASSERT(matcher->groupCount() == 0);
2142
2143 delete matcher;
2144 delete pat;
2145
2146 utext_close(&input);
2147 utext_close(&re);
2148 }
2149
2150
2151 //
2152 // find, with \G in pattern (true if at the end of a previous match).
2153 //
2154 {
2155 int32_t flags=0;
2156 UParseError pe;
2157 UErrorCode status=U_ZERO_ERROR;
2158 UText re=UTEXT_INITIALIZER;
2159 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2160 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2161
2162 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2163
2164 REGEX_CHECK_STATUS;
2165 UText input = UTEXT_INITIALIZER;
2166 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2167 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2168 // 012345678901234567
2169
2170 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2171 REGEX_CHECK_STATUS;
2172 REGEX_ASSERT(matcher->find());
2173 REGEX_ASSERT(matcher->start(status) == 0);
2174 REGEX_ASSERT(matcher->start(1, status) == -1);
2175 REGEX_ASSERT(matcher->start(2, status) == 1);
2176
2177 REGEX_ASSERT(matcher->find());
2178 REGEX_ASSERT(matcher->start(status) == 4);
2179 REGEX_ASSERT(matcher->start(1, status) == 4);
2180 REGEX_ASSERT(matcher->start(2, status) == -1);
2181 REGEX_CHECK_STATUS;
2182
2183 delete matcher;
2184 delete pat;
2185
2186 utext_close(&input);
2187 utext_close(&re);
2188 }
2189
2190 //
2191 // find with zero length matches, match position should bump ahead
2192 // to prevent loops.
2193 //
2194 {
2195 int32_t i;
2196 UErrorCode status=U_ZERO_ERROR;
2197 RegexMatcher m("(?= ?)", 0, status); // This pattern will find zero-length matches anywhere,
2198 // using an always-true look-ahead.
2199 REGEX_CHECK_STATUS;
2200 UText s = UTEXT_INITIALIZER;
2201 utext_openUTF8(&s, " ", -1, &status);
2202 m.reset(&s);
2203 for (i=0; ; i++) {
2204 if (m.find() == FALSE) {
2205 break;
2206 }
2207 REGEX_ASSERT(m.start(status) == i);
2208 REGEX_ASSERT(m.end(status) == i);
2209 }
2210 REGEX_ASSERT(i==5);
2211
2212 // Check that the bump goes over characters outside the BMP OK
2213 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2214 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2215 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2216 m.reset(&s);
2217 for (i=0; ; i+=4) {
2218 if (m.find() == FALSE) {
2219 break;
2220 }
2221 REGEX_ASSERT(m.start(status) == i);
2222 REGEX_ASSERT(m.end(status) == i);
2223 }
2224 REGEX_ASSERT(i==20);
2225
2226 utext_close(&s);
2227 }
2228 {
2229 // find() loop breaking test.
2230 // with pattern of /.?/, should see a series of one char matches, then a single
2231 // match of zero length at the end of the input string.
2232 int32_t i;
2233 UErrorCode status=U_ZERO_ERROR;
2234 RegexMatcher m(".?", 0, status);
2235 REGEX_CHECK_STATUS;
2236 UText s = UTEXT_INITIALIZER;
2237 utext_openUTF8(&s, " ", -1, &status);
2238 m.reset(&s);
2239 for (i=0; ; i++) {
2240 if (m.find() == FALSE) {
2241 break;
2242 }
2243 REGEX_ASSERT(m.start(status) == i);
2244 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2245 }
2246 REGEX_ASSERT(i==5);
2247
2248 utext_close(&s);
2249 }
2250
2251
2252 //
2253 // Matchers with no input string behave as if they had an empty input string.
2254 //
2255
2256 {
2257 UErrorCode status = U_ZERO_ERROR;
2258 RegexMatcher m(".?", 0, status);
2259 REGEX_CHECK_STATUS;
2260 REGEX_ASSERT(m.find());
2261 REGEX_ASSERT(m.start(status) == 0);
2262 REGEX_ASSERT(m.input() == "");
2263 }
2264 {
2265 UErrorCode status = U_ZERO_ERROR;
2266 RegexPattern *p = RegexPattern::compile(".", 0, status);
2267 RegexMatcher *m = p->matcher(status);
2268 REGEX_CHECK_STATUS;
2269
2270 REGEX_ASSERT(m->find() == FALSE);
2271 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2272 delete m;
2273 delete p;
2274 }
2275
2276 //
2277 // Regions
2278 //
2279 {
2280 UErrorCode status = U_ZERO_ERROR;
2281 UText testPattern = UTEXT_INITIALIZER;
2282 UText testText = UTEXT_INITIALIZER;
2283 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2284 REGEX_VERBOSE_TEXT(&testPattern);
2285 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2286 REGEX_VERBOSE_TEXT(&testText);
2287
2288 RegexMatcher m(&testPattern, &testText, 0, status);
2289 REGEX_CHECK_STATUS;
2290 REGEX_ASSERT(m.regionStart() == 0);
2291 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2292 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2293 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2294
2295 m.region(2,4, status);
2296 REGEX_CHECK_STATUS;
2297 REGEX_ASSERT(m.matches(status));
2298 REGEX_ASSERT(m.start(status)==2);
2299 REGEX_ASSERT(m.end(status)==4);
2300 REGEX_CHECK_STATUS;
2301
2302 m.reset();
2303 REGEX_ASSERT(m.regionStart() == 0);
2304 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2305
2306 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2307 REGEX_VERBOSE_TEXT(&testText);
2308 m.reset(&testText);
2309 REGEX_ASSERT(m.regionStart() == 0);
2310 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2311
2312 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2313 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2314 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2315 REGEX_ASSERT(&m == &m.reset());
2316 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2317
2318 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2319 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2320 REGEX_ASSERT(&m == &m.reset());
2321 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2322
2323 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2324 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2325 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2326 REGEX_ASSERT(&m == &m.reset());
2327 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2328
2329 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2330 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2331 REGEX_ASSERT(&m == &m.reset());
2332 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2333
2334 utext_close(&testText);
2335 utext_close(&testPattern);
2336 }
2337
2338 //
2339 // hitEnd() and requireEnd()
2340 //
2341 {
2342 UErrorCode status = U_ZERO_ERROR;
2343 UText testPattern = UTEXT_INITIALIZER;
2344 UText testText = UTEXT_INITIALIZER;
2345 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2346 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2347 utext_openUTF8(&testPattern, str_, -1, &status);
2348 utext_openUTF8(&testText, str_aabb, -1, &status);
2349
2350 RegexMatcher m1(&testPattern, &testText, 0, status);
2351 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2352 REGEX_ASSERT(m1.hitEnd() == TRUE);
2353 REGEX_ASSERT(m1.requireEnd() == FALSE);
2354 REGEX_CHECK_STATUS;
2355
2356 status = U_ZERO_ERROR;
2357 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2358 utext_openUTF8(&testPattern, str_a, -1, &status);
2359 RegexMatcher m2(&testPattern, &testText, 0, status);
2360 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2361 REGEX_ASSERT(m2.hitEnd() == FALSE);
2362 REGEX_ASSERT(m2.requireEnd() == FALSE);
2363 REGEX_CHECK_STATUS;
2364
2365 status = U_ZERO_ERROR;
2366 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2367 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2368 RegexMatcher m3(&testPattern, &testText, 0, status);
2369 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2370 REGEX_ASSERT(m3.hitEnd() == TRUE);
2371 REGEX_ASSERT(m3.requireEnd() == TRUE);
2372 REGEX_CHECK_STATUS;
2373
2374 utext_close(&testText);
2375 utext_close(&testPattern);
2376 }
2377 }
2378
2379
2380 //---------------------------------------------------------------------------
2381 //
2382 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2383 // Replace family of functions.
2384 //
2385 //---------------------------------------------------------------------------
2386 void RegexTest::API_Replace_UTF8() {
2387 //
2388 // Replace
2389 //
2390 int32_t flags=0;
2391 UParseError pe;
2392 UErrorCode status=U_ZERO_ERROR;
2393
2394 UText re=UTEXT_INITIALIZER;
2395 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2396 REGEX_VERBOSE_TEXT(&re);
2397 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2398 REGEX_CHECK_STATUS;
2399
2400 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2401 // 012345678901234567
2402 UText dataText = UTEXT_INITIALIZER;
2403 utext_openUTF8(&dataText, data, -1, &status);
2404 REGEX_CHECK_STATUS;
2405 REGEX_VERBOSE_TEXT(&dataText);
2406 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2407
2408 //
2409 // Plain vanilla matches.
2410 //
2411 UnicodeString dest;
2412 UText destText = UTEXT_INITIALIZER;
2413 utext_openUnicodeString(&destText, &dest, &status);
2414 UText *result;
2415
2416 UText replText = UTEXT_INITIALIZER;
2417
2418 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2419 utext_openUTF8(&replText, str_yz, -1, &status);
2420 REGEX_VERBOSE_TEXT(&replText);
2421 result = matcher->replaceFirst(&replText, NULL, status);
2422 REGEX_CHECK_STATUS;
2423 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2424 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2425 utext_close(result);
2426 result = matcher->replaceFirst(&replText, &destText, status);
2427 REGEX_CHECK_STATUS;
2428 REGEX_ASSERT(result == &destText);
2429 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2430
2431 result = matcher->replaceAll(&replText, NULL, status);
2432 REGEX_CHECK_STATUS;
2433 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2434 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2435 utext_close(result);
2436
2437 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2438 result = matcher->replaceAll(&replText, &destText, status);
2439 REGEX_CHECK_STATUS;
2440 REGEX_ASSERT(result == &destText);
2441 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2442
2443 //
2444 // Plain vanilla non-matches.
2445 //
2446 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2447 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2448 matcher->reset(&dataText);
2449
2450 result = matcher->replaceFirst(&replText, NULL, status);
2451 REGEX_CHECK_STATUS;
2452 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2453 utext_close(result);
2454 result = matcher->replaceFirst(&replText, &destText, status);
2455 REGEX_CHECK_STATUS;
2456 REGEX_ASSERT(result == &destText);
2457 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2458
2459 result = matcher->replaceAll(&replText, NULL, status);
2460 REGEX_CHECK_STATUS;
2461 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2462 utext_close(result);
2463 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2464 result = matcher->replaceAll(&replText, &destText, status);
2465 REGEX_CHECK_STATUS;
2466 REGEX_ASSERT(result == &destText);
2467 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2468
2469 //
2470 // Empty source string
2471 //
2472 utext_openUTF8(&dataText, NULL, 0, &status);
2473 matcher->reset(&dataText);
2474
2475 result = matcher->replaceFirst(&replText, NULL, status);
2476 REGEX_CHECK_STATUS;
2477 REGEX_ASSERT_UTEXT_UTF8("", result);
2478 utext_close(result);
2479 result = matcher->replaceFirst(&replText, &destText, status);
2480 REGEX_CHECK_STATUS;
2481 REGEX_ASSERT(result == &destText);
2482 REGEX_ASSERT_UTEXT_UTF8("", result);
2483
2484 result = matcher->replaceAll(&replText, NULL, status);
2485 REGEX_CHECK_STATUS;
2486 REGEX_ASSERT_UTEXT_UTF8("", result);
2487 utext_close(result);
2488 result = matcher->replaceAll(&replText, &destText, status);
2489 REGEX_CHECK_STATUS;
2490 REGEX_ASSERT(result == &destText);
2491 REGEX_ASSERT_UTEXT_UTF8("", result);
2492
2493 //
2494 // Empty substitution string
2495 //
2496 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2497 matcher->reset(&dataText);
2498
2499 utext_openUTF8(&replText, NULL, 0, &status);
2500 result = matcher->replaceFirst(&replText, NULL, status);
2501 REGEX_CHECK_STATUS;
2502 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2503 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2504 utext_close(result);
2505 result = matcher->replaceFirst(&replText, &destText, status);
2506 REGEX_CHECK_STATUS;
2507 REGEX_ASSERT(result == &destText);
2508 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2509
2510 result = matcher->replaceAll(&replText, NULL, status);
2511 REGEX_CHECK_STATUS;
2512 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2513 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2514 utext_close(result);
2515 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2516 result = matcher->replaceAll(&replText, &destText, status);
2517 REGEX_CHECK_STATUS;
2518 REGEX_ASSERT(result == &destText);
2519 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2520
2521 //
2522 // match whole string
2523 //
2524 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2525 utext_openUTF8(&dataText, str_abc, -1, &status);
2526 matcher->reset(&dataText);
2527
2528 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2529 utext_openUTF8(&replText, str_xyz, -1, &status);
2530 result = matcher->replaceFirst(&replText, NULL, status);
2531 REGEX_CHECK_STATUS;
2532 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2533 utext_close(result);
2534 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2535 result = matcher->replaceFirst(&replText, &destText, status);
2536 REGEX_CHECK_STATUS;
2537 REGEX_ASSERT(result == &destText);
2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539
2540 result = matcher->replaceAll(&replText, NULL, status);
2541 REGEX_CHECK_STATUS;
2542 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2543 utext_close(result);
2544 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2545 result = matcher->replaceAll(&replText, &destText, status);
2546 REGEX_CHECK_STATUS;
2547 REGEX_ASSERT(result == &destText);
2548 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2549
2550 //
2551 // Capture Group, simple case
2552 //
2553 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2554 utext_openUTF8(&re, str_add, -1, &status);
2555 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2556 REGEX_CHECK_STATUS;
2557
2558 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2559 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2560 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2561 REGEX_CHECK_STATUS;
2562
2563 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2564 utext_openUTF8(&replText, str_11, -1, &status);
2565 result = matcher2->replaceFirst(&replText, NULL, status);
2566 REGEX_CHECK_STATUS;
2567 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2568 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2569 utext_close(result);
2570 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2571 result = matcher2->replaceFirst(&replText, &destText, status);
2572 REGEX_CHECK_STATUS;
2573 REGEX_ASSERT(result == &destText);
2574 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2575
2576 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2577 utext_openUTF8(&replText, str_v, -1, &status);
2578 REGEX_VERBOSE_TEXT(&replText);
2579 result = matcher2->replaceFirst(&replText, NULL, status);
2580 REGEX_CHECK_STATUS;
2581 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2582 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2583 utext_close(result);
2584 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2585 result = matcher2->replaceFirst(&replText, &destText, status);
2586 REGEX_CHECK_STATUS;
2587 REGEX_ASSERT(result == &destText);
2588 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2589
2590 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2591 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2592 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2593 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2594 result = matcher2->replaceFirst(&replText, NULL, status);
2595 REGEX_CHECK_STATUS;
2596 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2597 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2598 utext_close(result);
2599 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2600 result = matcher2->replaceFirst(&replText, &destText, status);
2601 REGEX_CHECK_STATUS;
2602 REGEX_ASSERT(result == &destText);
2603 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2604
2605 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2606 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2607 // 012345678901234567890123456
2608 supplDigitChars[22] = 0xF0;
2609 supplDigitChars[23] = 0x9D;
2610 supplDigitChars[24] = 0x9F;
2611 supplDigitChars[25] = 0x8F;
2612 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2613
2614 result = matcher2->replaceFirst(&replText, NULL, status);
2615 REGEX_CHECK_STATUS;
2616 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2617 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2618 utext_close(result);
2619 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2620 result = matcher2->replaceFirst(&replText, &destText, status);
2621 REGEX_CHECK_STATUS;
2622 REGEX_ASSERT(result == &destText);
2623 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2624 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2625 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2626 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2627 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2628 utext_close(result);
2629 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2630 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2631 REGEX_ASSERT(result == &destText);
2632 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2633
2634 //
2635 // Replacement String with \u hex escapes
2636 //
2637 {
2638 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2639 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2640 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2641 utext_openUTF8(&replText, str_u0043, -1, &status);
2642 matcher->reset(&dataText);
2643
2644 result = matcher->replaceAll(&replText, NULL, status);
2645 REGEX_CHECK_STATUS;
2646 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2647 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2648 utext_close(result);
2649 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2650 result = matcher->replaceAll(&replText, &destText, status);
2651 REGEX_CHECK_STATUS;
2652 REGEX_ASSERT(result == &destText);
2653 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2654 }
2655 {
2656 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2657 utext_openUTF8(&dataText, str_abc, -1, &status);
2658 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2659 utext_openUTF8(&replText, str_U00010000, -1, &status);
2660 matcher->reset(&dataText);
2661
2662 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2663 // 0123456789
2664 expected[2] = 0xF0;
2665 expected[3] = 0x90;
2666 expected[4] = 0x80;
2667 expected[5] = 0x80;
2668
2669 result = matcher->replaceAll(&replText, NULL, status);
2670 REGEX_CHECK_STATUS;
2671 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2672 utext_close(result);
2673 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2674 result = matcher->replaceAll(&replText, &destText, status);
2675 REGEX_CHECK_STATUS;
2676 REGEX_ASSERT(result == &destText);
2677 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2678 }
2679 // TODO: need more thorough testing of capture substitutions.
2680
2681 // Bug 4057
2682 //
2683 {
2684 status = U_ZERO_ERROR;
2685 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2686 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2687 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2688 utext_openUTF8(&re, str_ssee, -1, &status);
2689 utext_openUTF8(&dataText, str_blah, -1, &status);
2690 utext_openUTF8(&replText, str_ooh, -1, &status);
2691
2692 RegexMatcher m(&re, 0, status);
2693 REGEX_CHECK_STATUS;
2694
2695 UnicodeString result;
2696 UText resultText = UTEXT_INITIALIZER;
2697 utext_openUnicodeString(&resultText, &result, &status);
2698
2699 // Multiple finds do NOT bump up the previous appendReplacement position.
2700 m.reset(&dataText);
2701 m.find();
2702 m.find();
2703 m.appendReplacement(&resultText, &replText, status);
2704 REGEX_CHECK_STATUS;
2705 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2706 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2707
2708 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2709 status = U_ZERO_ERROR;
2710 result.truncate(0);
2711 utext_openUnicodeString(&resultText, &result, &status);
2712 m.reset(10, status);
2713 m.find();
2714 m.find();
2715 m.appendReplacement(&resultText, &replText, status);
2716 REGEX_CHECK_STATUS;
2717 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2718 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2719
2720 // find() at interior of string, appendReplacement still starts at beginning.
2721 status = U_ZERO_ERROR;
2722 result.truncate(0);
2723 utext_openUnicodeString(&resultText, &result, &status);
2724 m.reset();
2725 m.find(10, status);
2726 m.find();
2727 m.appendReplacement(&resultText, &replText, status);
2728 REGEX_CHECK_STATUS;
2729 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2730 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2731
2732 m.appendTail(&resultText, status);
2733 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2734 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2735
2736 utext_close(&resultText);
2737 }
2738
2739 delete matcher2;
2740 delete pat2;
2741 delete matcher;
2742 delete pat;
2743
2744 utext_close(&dataText);
2745 utext_close(&replText);
2746 utext_close(&destText);
2747 utext_close(&re);
2748 }
2749
2750
2751 //---------------------------------------------------------------------------
2752 //
2753 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2754 // present and nominally working.
2755 //
2756 //---------------------------------------------------------------------------
2757 void RegexTest::API_Pattern_UTF8() {
2758 RegexPattern pata; // Test default constructor to not crash.
2759 RegexPattern patb;
2760
2761 REGEX_ASSERT(pata == patb);
2762 REGEX_ASSERT(pata == pata);
2763
2764 UText re1 = UTEXT_INITIALIZER;
2765 UText re2 = UTEXT_INITIALIZER;
2766 UErrorCode status = U_ZERO_ERROR;
2767 UParseError pe;
2768
2769 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2770 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2771 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2772 utext_openUTF8(&re2, str_def, -1, &status);
2773
2774 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2775 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2776 REGEX_CHECK_STATUS;
2777 REGEX_ASSERT(*pat1 == *pat1);
2778 REGEX_ASSERT(*pat1 != pata);
2779
2780 // Assign
2781 patb = *pat1;
2782 REGEX_ASSERT(patb == *pat1);
2783
2784 // Copy Construct
2785 RegexPattern patc(*pat1);
2786 REGEX_ASSERT(patc == *pat1);
2787 REGEX_ASSERT(patb == patc);
2788 REGEX_ASSERT(pat1 != pat2);
2789 patb = *pat2;
2790 REGEX_ASSERT(patb != patc);
2791 REGEX_ASSERT(patb == *pat2);
2792
2793 // Compile with no flags.
2794 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2795 REGEX_ASSERT(*pat1a == *pat1);
2796
2797 REGEX_ASSERT(pat1a->flags() == 0);
2798
2799 // Compile with different flags should be not equal
2800 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2801 REGEX_CHECK_STATUS;
2802
2803 REGEX_ASSERT(*pat1b != *pat1a);
2804 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2805 REGEX_ASSERT(pat1a->flags() == 0);
2806 delete pat1b;
2807
2808 // clone
2809 RegexPattern *pat1c = pat1->clone();
2810 REGEX_ASSERT(*pat1c == *pat1);
2811 REGEX_ASSERT(*pat1c != *pat2);
2812
2813 delete pat1c;
2814 delete pat1a;
2815 delete pat1;
2816 delete pat2;
2817
2818 utext_close(&re1);
2819 utext_close(&re2);
2820
2821
2822 //
2823 // Verify that a matcher created from a cloned pattern works.
2824 // (Jitterbug 3423)
2825 //
2826 {
2827 UErrorCode status = U_ZERO_ERROR;
2828 UText pattern = UTEXT_INITIALIZER;
2829 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2830 utext_openUTF8(&pattern, str_pL, -1, &status);
2831
2832 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2833 RegexPattern *pClone = pSource->clone();
2834 delete pSource;
2835 RegexMatcher *mFromClone = pClone->matcher(status);
2836 REGEX_CHECK_STATUS;
2837
2838 UText input = UTEXT_INITIALIZER;
2839 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2840 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2841 mFromClone->reset(&input);
2842 REGEX_ASSERT(mFromClone->find() == TRUE);
2843 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2844 REGEX_ASSERT(mFromClone->find() == TRUE);
2845 REGEX_ASSERT(mFromClone->group(status) == "World");
2846 REGEX_ASSERT(mFromClone->find() == FALSE);
2847 delete mFromClone;
2848 delete pClone;
2849
2850 utext_close(&input);
2851 utext_close(&pattern);
2852 }
2853
2854 //
2855 // matches convenience API
2856 //
2857 {
2858 UErrorCode status = U_ZERO_ERROR;
2859 UText pattern = UTEXT_INITIALIZER;
2860 UText input = UTEXT_INITIALIZER;
2861
2862 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2863 utext_openUTF8(&input, str_randominput, -1, &status);
2864
2865 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2866 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2867 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2868 REGEX_CHECK_STATUS;
2869
2870 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2871 utext_openUTF8(&pattern, str_abc, -1, &status);
2872 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2873 REGEX_CHECK_STATUS;
2874
2875 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2876 utext_openUTF8(&pattern, str_nput, -1, &status);
2877 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2878 REGEX_CHECK_STATUS;
2879
2880 utext_openUTF8(&pattern, str_randominput, -1, &status);
2881 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2882 REGEX_CHECK_STATUS;
2883
2884 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2885 utext_openUTF8(&pattern, str_u, -1, &status);
2886 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2887 REGEX_CHECK_STATUS;
2888
2889 utext_openUTF8(&input, str_abc, -1, &status);
2890 utext_openUTF8(&pattern, str_abc, -1, &status);
2891 status = U_INDEX_OUTOFBOUNDS_ERROR;
2892 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2893 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2894
2895 utext_close(&input);
2896 utext_close(&pattern);
2897 }
2898
2899
2900 //
2901 // Split()
2902 //
2903 status = U_ZERO_ERROR;
2904 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2905 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2906 pat1 = RegexPattern::compile(&re1, pe, status);
2907 REGEX_CHECK_STATUS;
2908 UnicodeString fields[10];
2909
2910 int32_t n;
2911 n = pat1->split("Now is the time", fields, 10, status);
2912 REGEX_CHECK_STATUS;
2913 REGEX_ASSERT(n==4);
2914 REGEX_ASSERT(fields[0]=="Now");
2915 REGEX_ASSERT(fields[1]=="is");
2916 REGEX_ASSERT(fields[2]=="the");
2917 REGEX_ASSERT(fields[3]=="time");
2918 REGEX_ASSERT(fields[4]=="");
2919
2920 n = pat1->split("Now is the time", fields, 2, status);
2921 REGEX_CHECK_STATUS;
2922 REGEX_ASSERT(n==2);
2923 REGEX_ASSERT(fields[0]=="Now");
2924 REGEX_ASSERT(fields[1]=="is the time");
2925 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2926
2927 fields[1] = "*";
2928 status = U_ZERO_ERROR;
2929 n = pat1->split("Now is the time", fields, 1, status);
2930 REGEX_CHECK_STATUS;
2931 REGEX_ASSERT(n==1);
2932 REGEX_ASSERT(fields[0]=="Now is the time");
2933 REGEX_ASSERT(fields[1]=="*");
2934 status = U_ZERO_ERROR;
2935
2936 n = pat1->split(" Now is the time ", fields, 10, status);
2937 REGEX_CHECK_STATUS;
2938 REGEX_ASSERT(n==6);
2939 REGEX_ASSERT(fields[0]=="");
2940 REGEX_ASSERT(fields[1]=="Now");
2941 REGEX_ASSERT(fields[2]=="is");
2942 REGEX_ASSERT(fields[3]=="the");
2943 REGEX_ASSERT(fields[4]=="time");
2944 REGEX_ASSERT(fields[5]=="");
2945 REGEX_ASSERT(fields[6]=="");
2946
2947 fields[2] = "*";
2948 n = pat1->split(" ", fields, 10, status);
2949 REGEX_CHECK_STATUS;
2950 REGEX_ASSERT(n==2);
2951 REGEX_ASSERT(fields[0]=="");
2952 REGEX_ASSERT(fields[1]=="");
2953 REGEX_ASSERT(fields[2]=="*");
2954
2955 fields[0] = "foo";
2956 n = pat1->split("", fields, 10, status);
2957 REGEX_CHECK_STATUS;
2958 REGEX_ASSERT(n==0);
2959 REGEX_ASSERT(fields[0]=="foo");
2960
2961 delete pat1;
2962
2963 // split, with a pattern with (capture)
2964 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2965 pat1 = RegexPattern::compile(&re1, pe, status);
2966 REGEX_CHECK_STATUS;
2967
2968 status = U_ZERO_ERROR;
2969 fields[6] = fields[7] = "*";
2970 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2971 REGEX_CHECK_STATUS;
2972 REGEX_ASSERT(n==7);
2973 REGEX_ASSERT(fields[0]=="");
2974 REGEX_ASSERT(fields[1]=="a");
2975 REGEX_ASSERT(fields[2]=="Now is ");
2976 REGEX_ASSERT(fields[3]=="b");
2977 REGEX_ASSERT(fields[4]=="the time");
2978 REGEX_ASSERT(fields[5]=="c");
2979 REGEX_ASSERT(fields[6]=="");
2980 REGEX_ASSERT(fields[7]=="*");
2981 REGEX_ASSERT(status==U_ZERO_ERROR);
2982
2983 fields[6] = fields[7] = "*";
2984 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2985 REGEX_CHECK_STATUS;
2986 REGEX_ASSERT(n==7);
2987 REGEX_ASSERT(fields[0]==" ");
2988 REGEX_ASSERT(fields[1]=="a");
2989 REGEX_ASSERT(fields[2]=="Now is ");
2990 REGEX_ASSERT(fields[3]=="b");
2991 REGEX_ASSERT(fields[4]=="the time");
2992 REGEX_ASSERT(fields[5]=="c");
2993 REGEX_ASSERT(fields[6]=="");
2994 REGEX_ASSERT(fields[7]=="*");
2995
2996 status = U_ZERO_ERROR;
2997 fields[6] = "foo";
2998 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
2999 REGEX_CHECK_STATUS;
3000 REGEX_ASSERT(n==6);
3001 REGEX_ASSERT(fields[0]==" ");
3002 REGEX_ASSERT(fields[1]=="a");
3003 REGEX_ASSERT(fields[2]=="Now is ");
3004 REGEX_ASSERT(fields[3]=="b");
3005 REGEX_ASSERT(fields[4]=="the time");
3006 REGEX_ASSERT(fields[5]==" ");
3007 REGEX_ASSERT(fields[6]=="foo");
3008
3009 status = U_ZERO_ERROR;
3010 fields[5] = "foo";
3011 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3012 REGEX_CHECK_STATUS;
3013 REGEX_ASSERT(n==5);
3014 REGEX_ASSERT(fields[0]==" ");
3015 REGEX_ASSERT(fields[1]=="a");
3016 REGEX_ASSERT(fields[2]=="Now is ");
3017 REGEX_ASSERT(fields[3]=="b");
3018 REGEX_ASSERT(fields[4]=="the time<c>");
3019 REGEX_ASSERT(fields[5]=="foo");
3020
3021 status = U_ZERO_ERROR;
3022 fields[5] = "foo";
3023 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3024 REGEX_CHECK_STATUS;
3025 REGEX_ASSERT(n==5);
3026 REGEX_ASSERT(fields[0]==" ");
3027 REGEX_ASSERT(fields[1]=="a");
3028 REGEX_ASSERT(fields[2]=="Now is ");
3029 REGEX_ASSERT(fields[3]=="b");
3030 REGEX_ASSERT(fields[4]=="the time");
3031 REGEX_ASSERT(fields[5]=="foo");
3032
3033 status = U_ZERO_ERROR;
3034 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3035 REGEX_CHECK_STATUS;
3036 REGEX_ASSERT(n==4);
3037 REGEX_ASSERT(fields[0]==" ");
3038 REGEX_ASSERT(fields[1]=="a");
3039 REGEX_ASSERT(fields[2]=="Now is ");
3040 REGEX_ASSERT(fields[3]=="the time<c>");
3041 status = U_ZERO_ERROR;
3042 delete pat1;
3043
3044 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3045 pat1 = RegexPattern::compile(&re1, pe, status);
3046 REGEX_CHECK_STATUS;
3047 n = pat1->split("1-10,20", fields, 10, status);
3048 REGEX_CHECK_STATUS;
3049 REGEX_ASSERT(n==5);
3050 REGEX_ASSERT(fields[0]=="1");
3051 REGEX_ASSERT(fields[1]=="-");
3052 REGEX_ASSERT(fields[2]=="10");
3053 REGEX_ASSERT(fields[3]==",");
3054 REGEX_ASSERT(fields[4]=="20");
3055 delete pat1;
3056
3057
3058 //
3059 // split of a UText based string, with library allocating output UTexts.
3060 //
3061 {
3062 status = U_ZERO_ERROR;
3063 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3064 UnicodeString stringToSplit("first:second:third");
3065 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3066 REGEX_CHECK_STATUS;
3067
3068 UText *splits[10] = {NULL};
3069 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3070 REGEX_CHECK_STATUS;
3071 REGEX_ASSERT(numFields == 5);
3072 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3073 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3074 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3075 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3076 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3077 REGEX_ASSERT(splits[5] == NULL);
3078
3079 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3080 if (splits[i]) {
3081 utext_close(splits[i]);
3082 splits[i] = NULL;
3083 }
3084 }
3085 utext_close(textToSplit);
3086 }
3087
3088
3089 //
3090 // RegexPattern::pattern() and patternText()
3091 //
3092 pat1 = new RegexPattern();
3093 REGEX_ASSERT(pat1->pattern() == "");
3094 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3095 delete pat1;
3096 const char *helloWorldInvariant = "(Hello, world)*";
3097 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3098 pat1 = RegexPattern::compile(&re1, pe, status);
3099 REGEX_CHECK_STATUS;
3100 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3101 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3102 delete pat1;
3103
3104 utext_close(&re1);
3105 }
3106
3107
3108 //---------------------------------------------------------------------------
3109 //
3110 // Extended A more thorough check for features of regex patterns
3111 // The test cases are in a separate data file,
3112 // source/tests/testdata/regextst.txt
3113 // A description of the test data format is included in that file.
3114 //
3115 //---------------------------------------------------------------------------
3116
3117 const char *
3118 RegexTest::getPath(char buffer[2048], const char *filename) {
3119 UErrorCode status=U_ZERO_ERROR;
3120 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3121 if (U_FAILURE(status)) {
3122 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3123 return NULL;
3124 }
3125
3126 strcpy(buffer, testDataDirectory);
3127 strcat(buffer, filename);
3128 return buffer;
3129 }
3130
3131 void RegexTest::Extended() {
3132 char tdd[2048];
3133 const char *srcPath;
3134 UErrorCode status = U_ZERO_ERROR;
3135 int32_t lineNum = 0;
3136
3137 //
3138 // Open and read the test data file.
3139 //
3140 srcPath=getPath(tdd, "regextst.txt");
3141 if(srcPath==NULL) {
3142 return; /* something went wrong, error already output */
3143 }
3144
3145 int32_t len;
3146 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3147 if (U_FAILURE(status)) {
3148 return; /* something went wrong, error already output */
3149 }
3150
3151 //
3152 // Put the test data into a UnicodeString
3153 //
3154 UnicodeString testString(FALSE, testData, len);
3155
3156 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3157 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3158 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3159
3160 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3161 UnicodeString testPattern; // The pattern for test from the test file.
3162 UnicodeString testFlags; // the flags for a test.
3163 UnicodeString matchString; // The marked up string to be used as input
3164
3165 if (U_FAILURE(status)){
3166 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3167 delete [] testData;
3168 return;
3169 }
3170
3171 //
3172 // Loop over the test data file, once per line.
3173 //
3174 while (lineMat.find()) {
3175 lineNum++;
3176 if (U_FAILURE(status)) {
3177 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3178 }
3179
3180 status = U_ZERO_ERROR;
3181 UnicodeString testLine = lineMat.group(1, status);
3182 if (testLine.length() == 0) {
3183 continue;
3184 }
3185
3186 //
3187 // Parse the test line. Skip blank and comment only lines.
3188 // Separate out the three main fields - pattern, flags, target.
3189 //
3190
3191 commentMat.reset(testLine);
3192 if (commentMat.lookingAt(status)) {
3193 // This line is a comment, or blank.
3194 continue;
3195 }
3196
3197 //
3198 // Pull out the pattern field, remove it from the test file line.
3199 //
3200 quotedStuffMat.reset(testLine);
3201 if (quotedStuffMat.lookingAt(status)) {
3202 testPattern = quotedStuffMat.group(2, status);
3203 testLine.remove(0, quotedStuffMat.end(0, status));
3204 } else {
3205 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3206 continue;
3207 }
3208
3209
3210 //
3211 // Pull out the flags from the test file line.
3212 //
3213 flagsMat.reset(testLine);
3214 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3215 testFlags = flagsMat.group(1, status);
3216 if (flagsMat.group(2, status).length() > 0) {
3217 errln("Bad Match flag at line %d. Scanning %c\n",
3218 lineNum, flagsMat.group(2, status).charAt(0));
3219 continue;
3220 }
3221 testLine.remove(0, flagsMat.end(0, status));
3222
3223 //
3224 // Pull out the match string, as a whole.
3225 // We'll process the <tags> later.
3226 //
3227 quotedStuffMat.reset(testLine);
3228 if (quotedStuffMat.lookingAt(status)) {
3229 matchString = quotedStuffMat.group(2, status);
3230 testLine.remove(0, quotedStuffMat.end(0, status));
3231 } else {
3232 errln("Bad match string at test file line %d", lineNum);
3233 continue;
3234 }
3235
3236 //
3237 // The only thing left from the input line should be an optional trailing comment.
3238 //
3239 commentMat.reset(testLine);
3240 if (commentMat.lookingAt(status) == FALSE) {
3241 errln("Line %d: unexpected characters at end of test line.", lineNum);
3242 continue;
3243 }
3244
3245 //
3246 // Run the test
3247 //
3248 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3249 }
3250
3251 delete [] testData;
3252
3253 }
3254
3255
3256
3257 //---------------------------------------------------------------------------
3258 //
3259 // regex_find(pattern, flags, inputString, lineNumber)
3260 //
3261 // Function to run a single test from the Extended (data driven) tests.
3262 // See file test/testdata/regextst.txt for a description of the
3263 // pattern and inputString fields, and the allowed flags.
3264 // lineNumber is the source line in regextst.txt of the test.
3265 //
3266 //---------------------------------------------------------------------------
3267
3268
3269 // Set a value into a UVector at position specified by a decimal number in
3270 // a UnicodeString. This is a utility function needed by the actual test function,
3271 // which follows.
3272 static void set(UVector &vec, int32_t val, UnicodeString index) {
3273 UErrorCode status=U_ZERO_ERROR;
3274 int32_t idx = 0;
3275 for (int32_t i=0; i<index.length(); i++) {
3276 int32_t d=u_charDigitValue(index.charAt(i));
3277 if (d<0) {return;}
3278 idx = idx*10 + d;
3279 }
3280 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3281 vec.setElementAt(val, idx);
3282 }
3283
3284 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3285 UErrorCode status=U_ZERO_ERROR;
3286 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3287 vec.setElementAt(val, idx);
3288 }
3289
3290 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3291 {
3292 UBool couldFind = TRUE;
3293 UTEXT_SETNATIVEINDEX(utext, 0);
3294 int32_t i = 0;
3295 while (i < unistrOffset) {
3296 UChar32 c = UTEXT_NEXT32(utext);
3297 if (c != U_SENTINEL) {
3298 i += U16_LENGTH(c);
3299 } else {
3300 couldFind = FALSE;
3301 break;
3302 }
3303 }
3304 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3305 return couldFind;
3306 }
3307
3308
3309 void RegexTest::regex_find(const UnicodeString &pattern,
3310 const UnicodeString &flags,
3311 const UnicodeString &inputString,
3312 const char *srcPath,
3313 int32_t line) {
3314 UnicodeString unEscapedInput;
3315 UnicodeString deTaggedInput;
3316
3317 int32_t patternUTF8Length, inputUTF8Length;
3318 char *patternChars = NULL, *inputChars = NULL;
3319 UText patternText = UTEXT_INITIALIZER;
3320 UText inputText = UTEXT_INITIALIZER;
3321 UConverter *UTF8Converter = NULL;
3322
3323 UErrorCode status = U_ZERO_ERROR;
3324 UParseError pe;
3325 RegexPattern *parsePat = NULL;
3326 RegexMatcher *parseMatcher = NULL;
3327 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3328 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3329 UVector groupStarts(status);
3330 UVector groupEnds(status);
3331 UVector groupStartsUTF8(status);
3332 UVector groupEndsUTF8(status);
3333 UBool isMatch = FALSE, isUTF8Match = FALSE;
3334 UBool failed = FALSE;
3335 int32_t numFinds;
3336 int32_t i;
3337 UBool useMatchesFunc = FALSE;
3338 UBool useLookingAtFunc = FALSE;
3339 int32_t regionStart = -1;
3340 int32_t regionEnd = -1;
3341 int32_t regionStartUTF8 = -1;
3342 int32_t regionEndUTF8 = -1;
3343
3344
3345 //
3346 // Compile the caller's pattern
3347 //
3348 uint32_t bflags = 0;
3349 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3350 bflags |= UREGEX_CASE_INSENSITIVE;
3351 }
3352 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3353 bflags |= UREGEX_COMMENTS;
3354 }
3355 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3356 bflags |= UREGEX_DOTALL;
3357 }
3358 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3359 bflags |= UREGEX_MULTILINE;
3360 }
3361
3362 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3363 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3364 }
3365 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3366 bflags |= UREGEX_UNIX_LINES;
3367 }
3368 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3369 bflags |= UREGEX_LITERAL;
3370 }
3371
3372
3373 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3374 if (status != U_ZERO_ERROR) {
3375 #if UCONFIG_NO_BREAK_ITERATION==1
3376 // 'v' test flag means that the test pattern should not compile if ICU was configured
3377 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3378 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3379 goto cleanupAndReturn;
3380 }
3381 #endif
3382 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3383 // Expected pattern compilation error.
3384 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3385 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3386 }
3387 goto cleanupAndReturn;
3388 } else {
3389 // Unexpected pattern compilation error.
3390 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3391 goto cleanupAndReturn;
3392 }
3393 }
3394
3395 UTF8Converter = ucnv_open("UTF8", &status);
3396 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3397
3398 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3399 status = U_ZERO_ERROR; // buffer overflow
3400 patternChars = new char[patternUTF8Length+1];
3401 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3402 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3403
3404 if (status == U_ZERO_ERROR) {
3405 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3406
3407 if (status != U_ZERO_ERROR) {
3408 #if UCONFIG_NO_BREAK_ITERATION==1
3409 // 'v' test flag means that the test pattern should not compile if ICU was configured
3410 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3411 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3412 goto cleanupAndReturn;
3413 }
3414 #endif
3415 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3416 // Expected pattern compilation error.
3417 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3418 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3419 }
3420 goto cleanupAndReturn;
3421 } else {
3422 // Unexpected pattern compilation error.
3423 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3424 goto cleanupAndReturn;
3425 }
3426 }
3427 }
3428
3429 if (UTF8Pattern == NULL) {
3430 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3431 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3432 status = U_ZERO_ERROR;
3433 }
3434
3435 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3436 callerPattern->dumpPattern();
3437 }
3438
3439 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3440 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3441 goto cleanupAndReturn;
3442 }
3443
3444
3445 //
3446 // Number of times find() should be called on the test string, default to 1
3447 //
3448 numFinds = 1;
3449 for (i=2; i<=9; i++) {
3450 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3451 if (numFinds != 1) {
3452 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3453 goto cleanupAndReturn;
3454 }
3455 numFinds = i;
3456 }
3457 }
3458
3459 // 'M' flag. Use matches() instead of find()
3460 if (flags.indexOf((UChar)0x4d) >= 0) {
3461 useMatchesFunc = TRUE;
3462 }
3463 if (flags.indexOf((UChar)0x4c) >= 0) {
3464 useLookingAtFunc = TRUE;
3465 }
3466
3467 //
3468 // Find the tags in the input data, remove them, and record the group boundary
3469 // positions.
3470 //
3471 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3472 REGEX_CHECK_STATUS_L(line);
3473
3474 unEscapedInput = inputString.unescape();
3475 parseMatcher = parsePat->matcher(unEscapedInput, status);
3476 REGEX_CHECK_STATUS_L(line);
3477 while(parseMatcher->find()) {
3478 parseMatcher->appendReplacement(deTaggedInput, "", status);
3479 REGEX_CHECK_STATUS;
3480 UnicodeString groupNum = parseMatcher->group(2, status);
3481 if (groupNum == "r") {
3482 // <r> or </r>, a region specification within the string
3483 if (parseMatcher->group(1, status) == "/") {
3484 regionEnd = deTaggedInput.length();
3485 } else {
3486 regionStart = deTaggedInput.length();
3487 }
3488 } else {
3489 // <digits> or </digits>, a group match boundary tag.
3490 if (parseMatcher->group(1, status) == "/") {
3491 set(groupEnds, deTaggedInput.length(), groupNum);
3492 } else {
3493 set(groupStarts, deTaggedInput.length(), groupNum);
3494 }
3495 }
3496 }
3497 parseMatcher->appendTail(deTaggedInput);
3498 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3499 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3500 errln("mismatched <r> tags");
3501 failed = TRUE;
3502 goto cleanupAndReturn;
3503 }
3504
3505 //
3506 // Configure the matcher according to the flags specified with this test.
3507 //
3508 matcher = callerPattern->matcher(deTaggedInput, status);
3509 REGEX_CHECK_STATUS_L(line);
3510 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3511 matcher->setTrace(TRUE);
3512 }
3513
3514 if (UTF8Pattern != NULL) {
3515 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3516 status = U_ZERO_ERROR; // buffer overflow
3517 inputChars = new char[inputUTF8Length+1];
3518 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3519 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3520
3521 if (status == U_ZERO_ERROR) {
3522 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3523 REGEX_CHECK_STATUS_L(line);
3524 }
3525
3526 if (UTF8Matcher == NULL) {
3527 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3528 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3529 status = U_ZERO_ERROR;
3530 }
3531 }
3532
3533 //
3534 // Generate native indices for UTF8 versions of region and capture group info
3535 //
3536 if (UTF8Matcher != NULL) {
3537 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3538 UTF8Matcher->setTrace(TRUE);
3539 }
3540 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3541 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3542
3543 // Fill out the native index UVector info.
3544 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3545 for (i=0; i<groupStarts.size(); i++) {
3546 int32_t start = groupStarts.elementAti(i);
3547 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3548 if (start >= 0) {
3549 int32_t startUTF8;
3550 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3551 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3552 failed = TRUE;
3553 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3554 }
3555 setInt(groupStartsUTF8, startUTF8, i);
3556 }
3557
3558 int32_t end = groupEnds.elementAti(i);
3559 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3560 if (end >= 0) {
3561 int32_t endUTF8;
3562 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3563 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3564 failed = TRUE;
3565 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3566 }
3567 setInt(groupEndsUTF8, endUTF8, i);
3568 }
3569 }
3570 }
3571
3572 if (regionStart>=0) {
3573 matcher->region(regionStart, regionEnd, status);
3574 REGEX_CHECK_STATUS_L(line);
3575 if (UTF8Matcher != NULL) {
3576 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3577 REGEX_CHECK_STATUS_L(line);
3578 }
3579 }
3580 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3581 matcher->useAnchoringBounds(FALSE);
3582 if (UTF8Matcher != NULL) {
3583 UTF8Matcher->useAnchoringBounds(FALSE);
3584 }
3585 }
3586 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3587 matcher->useTransparentBounds(TRUE);
3588 if (UTF8Matcher != NULL) {
3589 UTF8Matcher->useTransparentBounds(TRUE);
3590 }
3591 }
3592
3593
3594
3595 //
3596 // Do a find on the de-tagged input using the caller's pattern
3597 // TODO: error on count>1 and not find().
3598 // error on both matches() and lookingAt().
3599 //
3600 for (i=0; i<numFinds; i++) {
3601 if (useMatchesFunc) {
3602 isMatch = matcher->matches(status);
3603 if (UTF8Matcher != NULL) {
3604 isUTF8Match = UTF8Matcher->matches(status);
3605 }
3606 } else if (useLookingAtFunc) {
3607 isMatch = matcher->lookingAt(status);
3608 if (UTF8Matcher != NULL) {
3609 isUTF8Match = UTF8Matcher->lookingAt(status);
3610 }
3611 } else {
3612 isMatch = matcher->find();
3613 if (UTF8Matcher != NULL) {
3614 isUTF8Match = UTF8Matcher->find();
3615 }
3616 }
3617 }
3618 matcher->setTrace(FALSE);
3619 if (UTF8Matcher) {
3620 UTF8Matcher->setTrace(FALSE);
3621 }
3622 if (U_FAILURE(status)) {
3623 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3624 }
3625
3626 //
3627 // Match up the groups from the find() with the groups from the tags
3628 //
3629
3630 // number of tags should match number of groups from find operation.
3631 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3632 // G option in test means that capture group data is not available in the
3633 // expected results, so the check needs to be suppressed.
3634 if (isMatch == FALSE && groupStarts.size() != 0) {
3635 dataerrln("Error at line %d: Match expected, but none found.", line);
3636 failed = TRUE;
3637 goto cleanupAndReturn;
3638 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3639 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3640 failed = TRUE;
3641 goto cleanupAndReturn;
3642 }
3643 if (isMatch && groupStarts.size() == 0) {
3644 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3645 failed = TRUE;
3646 }
3647 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3648 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3649 failed = TRUE;
3650 }
3651
3652 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3653 // Only check for match / no match. Don't check capture groups.
3654 goto cleanupAndReturn;
3655 }
3656
3657 REGEX_CHECK_STATUS_L(line);
3658 for (i=0; i<=matcher->groupCount(); i++) {
3659 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3660 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3661 if (matcher->start(i, status) != expectedStart) {
3662 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3663 line, i, expectedStart, matcher->start(i, status));
3664 failed = TRUE;
3665 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3666 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3667 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3668 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3669 failed = TRUE;
3670 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3671 }
3672
3673 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3674 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3675 if (matcher->end(i, status) != expectedEnd) {
3676 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3677 line, i, expectedEnd, matcher->end(i, status));
3678 failed = TRUE;
3679 // Error on end position; keep going; real error is probably yet to come as group
3680 // end positions work from end of the input data towards the front.
3681 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3682 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3683 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3684 failed = TRUE;
3685 // Error on end position; keep going; real error is probably yet to come as group
3686 // end positions work from end of the input data towards the front.
3687 }
3688 }
3689 if ( matcher->groupCount()+1 < groupStarts.size()) {
3690 errln("Error at line %d: Expected %d capture groups, found %d.",
3691 line, groupStarts.size()-1, matcher->groupCount());
3692 failed = TRUE;
3693 }
3694 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3695 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3696 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3697 failed = TRUE;
3698 }
3699
3700 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3701 matcher->requireEnd() == TRUE) {
3702 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3703 failed = TRUE;
3704 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3705 UTF8Matcher->requireEnd() == TRUE) {
3706 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3707 failed = TRUE;
3708 }
3709
3710 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3711 matcher->requireEnd() == FALSE) {
3712 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3713 failed = TRUE;
3714 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3715 UTF8Matcher->requireEnd() == FALSE) {
3716 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3717 failed = TRUE;
3718 }
3719
3720 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3721 matcher->hitEnd() == TRUE) {
3722 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3723 failed = TRUE;
3724 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3725 UTF8Matcher->hitEnd() == TRUE) {
3726 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3727 failed = TRUE;
3728 }
3729
3730 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3731 matcher->hitEnd() == FALSE) {
3732 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3733 failed = TRUE;
3734 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3735 UTF8Matcher->hitEnd() == FALSE) {
3736 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3737 failed = TRUE;
3738 }
3739
3740
3741 cleanupAndReturn:
3742 if (failed) {
3743 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3744 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3745 // callerPattern->dump();
3746 }
3747 delete parseMatcher;
3748 delete parsePat;
3749 delete UTF8Matcher;
3750 delete UTF8Pattern;
3751 delete matcher;
3752 delete callerPattern;
3753
3754 utext_close(&inputText);
3755 delete[] inputChars;
3756 utext_close(&patternText);
3757 delete[] patternChars;
3758 ucnv_close(UTF8Converter);
3759 }
3760
3761
3762
3763
3764 //---------------------------------------------------------------------------
3765 //
3766 // Errors Check for error handling in patterns.
3767 //
3768 //---------------------------------------------------------------------------
3769 void RegexTest::Errors() {
3770 // \escape sequences that aren't implemented yet.
3771 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3772
3773 // Missing close parentheses
3774 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3775 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3776 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3777
3778 // Extra close paren
3779 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3780 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3781 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3782
3783 // Look-ahead, Look-behind
3784 // TODO: add tests for unbounded length look-behinds.
3785 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3786
3787 // Attempt to use non-default flags
3788 {
3789 UParseError pe;
3790 UErrorCode status = U_ZERO_ERROR;
3791 int32_t flags = UREGEX_CANON_EQ |
3792 UREGEX_COMMENTS | UREGEX_DOTALL |
3793 UREGEX_MULTILINE;
3794 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3795 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3796 delete pat1;
3797 }
3798
3799
3800 // Quantifiers are allowed only after something that can be quantified.
3801 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3802 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3803 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3804
3805 // Mal-formed {min,max} quantifiers
3806 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3807 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3808 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3809 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3810 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3811 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3812 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3813 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3814 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3815
3816 // Ticket 5389
3817 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3818
3819 // Invalid Back Reference \0
3820 // For ICU 3.8 and earlier
3821 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3822 //
3823 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3824
3825 }
3826
3827
3828 //-------------------------------------------------------------------------------
3829 //
3830 // Read a text data file, convert it to UChars, and return the data
3831 // in one big UChar * buffer, which the caller must delete.
3832 //
3833 //--------------------------------------------------------------------------------
3834 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3835 const char *defEncoding, UErrorCode &status) {
3836 UChar *retPtr = NULL;
3837 char *fileBuf = NULL;
3838 UConverter* conv = NULL;
3839 FILE *f = NULL;
3840
3841 ulen = 0;
3842 if (U_FAILURE(status)) {
3843 return retPtr;
3844 }
3845
3846 //
3847 // Open the file.
3848 //
3849 f = fopen(fileName, "rb");
3850 if (f == 0) {
3851 dataerrln("Error opening test data file %s\n", fileName);
3852 status = U_FILE_ACCESS_ERROR;
3853 return NULL;
3854 }
3855 //
3856 // Read it in
3857 //
3858 int32_t fileSize;
3859 int32_t amt_read;
3860
3861 fseek( f, 0, SEEK_END);
3862 fileSize = ftell(f);
3863 fileBuf = new char[fileSize];
3864 fseek(f, 0, SEEK_SET);
3865 amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
3866 if (amt_read != fileSize || fileSize <= 0) {
3867 errln("Error reading test data file.");
3868 goto cleanUpAndReturn;
3869 }
3870
3871 //
3872 // Look for a Unicode Signature (BOM) on the data just read
3873 //
3874 int32_t signatureLength;
3875 const char * fileBufC;
3876 const char* encoding;
3877
3878 fileBufC = fileBuf;
3879 encoding = ucnv_detectUnicodeSignature(
3880 fileBuf, fileSize, &signatureLength, &status);
3881 if(encoding!=NULL ){
3882 fileBufC += signatureLength;
3883 fileSize -= signatureLength;
3884 } else {
3885 encoding = defEncoding;
3886 if (strcmp(encoding, "utf-8") == 0) {
3887 errln("file %s is missing its BOM", fileName);
3888 }
3889 }
3890
3891 //
3892 // Open a converter to take the rule file to UTF-16
3893 //
3894 conv = ucnv_open(encoding, &status);
3895 if (U_FAILURE(status)) {
3896 goto cleanUpAndReturn;
3897 }
3898
3899 //
3900 // Convert the rules to UChar.
3901 // Preflight first to determine required buffer size.
3902 //
3903 ulen = ucnv_toUChars(conv,
3904 NULL, // dest,
3905 0, // destCapacity,
3906 fileBufC,
3907 fileSize,
3908 &status);
3909 if (status == U_BUFFER_OVERFLOW_ERROR) {
3910 // Buffer Overflow is expected from the preflight operation.
3911 status = U_ZERO_ERROR;
3912
3913 retPtr = new UChar[ulen+1];
3914 ucnv_toUChars(conv,
3915 retPtr, // dest,
3916 ulen+1,
3917 fileBufC,
3918 fileSize,
3919 &status);
3920 }
3921
3922 cleanUpAndReturn:
3923 fclose(f);
3924 delete[] fileBuf;
3925 ucnv_close(conv);
3926 if (U_FAILURE(status)) {
3927 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3928 delete []retPtr;
3929 retPtr = 0;
3930 ulen = 0;
3931 };
3932 return retPtr;
3933 }
3934
3935
3936 //-------------------------------------------------------------------------------
3937 //
3938 // PerlTests - Run Perl's regular expression tests
3939 // The input file for this test is re_tests, the standard regular
3940 // expression test data distributed with the Perl source code.
3941 //
3942 // Here is Perl's description of the test data file:
3943 //
3944 // # The tests are in a separate file 't/op/re_tests'.
3945 // # Each line in that file is a separate test.
3946 // # There are five columns, separated by tabs.
3947 // #
3948 // # Column 1 contains the pattern, optionally enclosed in C<''>.
3949 // # Modifiers can be put after the closing C<'>.
3950 // #
3951 // # Column 2 contains the string to be matched.
3952 // #
3953 // # Column 3 contains the expected result:
3954 // # y expect a match
3955 // # n expect no match
3956 // # c expect an error
3957 // # B test exposes a known bug in Perl, should be skipped
3958 // # b test exposes a known bug in Perl, should be skipped if noamp
3959 // #
3960 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3961 // #
3962 // # Column 4 contains a string, usually C<$&>.
3963 // #
3964 // # Column 5 contains the expected result of double-quote
3965 // # interpolating that string after the match, or start of error message.
3966 // #
3967 // # Column 6, if present, contains a reason why the test is skipped.
3968 // # This is printed with "skipped", for harness to pick up.
3969 // #
3970 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
3971 // #
3972 // # If you want to add a regular expression test that can't be expressed
3973 // # in this format, don't add it here: put it in op/pat.t instead.
3974 //
3975 // For ICU, if field 3 contains an 'i', the test will be skipped.
3976 // Such a test exposes some known incompatibility between ICU and Perl regexps.
3977 // (The i is in addition to whatever was there before.)
3978 //
3979 //-------------------------------------------------------------------------------
3980 void RegexTest::PerlTests() {
3981 char tdd[2048];
3982 const char *srcPath;
3983 UErrorCode status = U_ZERO_ERROR;
3984 UParseError pe;
3985
3986 //
3987 // Open and read the test data file.
3988 //
3989 srcPath=getPath(tdd, "re_tests.txt");
3990 if(srcPath==NULL) {
3991 return; /* something went wrong, error already output */
3992 }
3993
3994 int32_t len;
3995 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3996 if (U_FAILURE(status)) {
3997 return; /* something went wrong, error already output */
3998 }
3999
4000 //
4001 // Put the test data into a UnicodeString
4002 //
4003 UnicodeString testDataString(FALSE, testData, len);
4004
4005 //
4006 // Regex to break the input file into lines, and strip the new lines.
4007 // One line per match, capture group one is the desired data.
4008 //
4009 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4010 if (U_FAILURE(status)) {
4011 dataerrln("RegexPattern::compile() error");
4012 return;
4013 }
4014 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4015
4016 //
4017 // Regex to split a test file line into fields.
4018 // There are six fields, separated by tabs.
4019 //
4020 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4021
4022 //
4023 // Regex to identify test patterns with flag settings, and to separate them.
4024 // Test patterns with flags look like 'pattern'i
4025 // Test patterns without flags are not quoted: pattern
4026 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4027 //
4028 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4029 RegexMatcher* flagMat = flagPat->matcher(status);
4030
4031 //
4032 // The Perl tests reference several perl-isms, which are evaluated/substituted
4033 // in the test data. Not being perl, this must be done explicitly. Here
4034 // are string constants and REs for these constructs.
4035 //
4036 UnicodeString nulnulSrc("${nulnul}");
4037 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4038 nulnul = nulnul.unescape();
4039
4040 UnicodeString ffffSrc("${ffff}");
4041 UnicodeString ffff("\\uffff", -1, US_INV);
4042 ffff = ffff.unescape();
4043
4044 // regexp for $-[0], $+[2], etc.
4045 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4046 RegexMatcher *groupsMat = groupsPat->matcher(status);
4047
4048 // regexp for $0, $1, $2, etc.
4049 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4050 RegexMatcher *cgMat = cgPat->matcher(status);
4051
4052
4053 //
4054 // Main Loop for the Perl Tests, runs once per line from the
4055 // test data file.
4056 //
4057 int32_t lineNum = 0;
4058 int32_t skippedUnimplementedCount = 0;
4059 while (lineMat->find()) {
4060 lineNum++;
4061
4062 //
4063 // Get a line, break it into its fields, do the Perl
4064 // variable substitutions.
4065 //
4066 UnicodeString line = lineMat->group(1, status);
4067 UnicodeString fields[7];
4068 fieldPat->split(line, fields, 7, status);
4069
4070 flagMat->reset(fields[0]);
4071 flagMat->matches(status);
4072 UnicodeString pattern = flagMat->group(2, status);
4073 pattern.findAndReplace("${bang}", "!");
4074 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4075 pattern.findAndReplace(ffffSrc, ffff);
4076
4077 //
4078 // Identify patterns that include match flag settings,
4079 // split off the flags, remove the extra quotes.
4080 //
4081 UnicodeString flagStr = flagMat->group(3, status);
4082 if (U_FAILURE(status)) {
4083 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4084 return;
4085 }
4086 int32_t flags = 0;
4087 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4088 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4089 const UChar UChar_m = 0x6d;
4090 const UChar UChar_x = 0x78;
4091 const UChar UChar_y = 0x79;
4092 if (flagStr.indexOf(UChar_i) != -1) {
4093 flags |= UREGEX_CASE_INSENSITIVE;
4094 }
4095 if (flagStr.indexOf(UChar_m) != -1) {
4096 flags |= UREGEX_MULTILINE;
4097 }
4098 if (flagStr.indexOf(UChar_x) != -1) {
4099 flags |= UREGEX_COMMENTS;
4100 }
4101
4102 //
4103 // Compile the test pattern.
4104 //
4105 status = U_ZERO_ERROR;
4106 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4107 if (status == U_REGEX_UNIMPLEMENTED) {
4108 //
4109 // Test of a feature that is planned for ICU, but not yet implemented.
4110 // skip the test.
4111 skippedUnimplementedCount++;
4112 delete testPat;
4113 status = U_ZERO_ERROR;
4114 continue;
4115 }
4116
4117 if (U_FAILURE(status)) {
4118 // Some tests are supposed to generate errors.
4119 // Only report an error for tests that are supposed to succeed.
4120 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4121 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4122 {
4123 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4124 }
4125 status = U_ZERO_ERROR;
4126 delete testPat;
4127 continue;
4128 }
4129
4130 if (fields[2].indexOf(UChar_i) >= 0) {
4131 // ICU should skip this test.
4132 delete testPat;
4133 continue;
4134 }
4135
4136 if (fields[2].indexOf(UChar_c) >= 0) {
4137 // This pattern should have caused a compilation error, but didn't/
4138 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4139 delete testPat;
4140 continue;
4141 }
4142
4143 //
4144 // replace the Perl variables that appear in some of the
4145 // match data strings.
4146 //
4147 UnicodeString matchString = fields[1];
4148 matchString.findAndReplace(nulnulSrc, nulnul);
4149 matchString.findAndReplace(ffffSrc, ffff);
4150
4151 // Replace any \n in the match string with an actual new-line char.
4152 // Don't do full unescape, as this unescapes more than Perl does, which
4153 // causes other spurious failures in the tests.
4154 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4155
4156
4157
4158 //
4159 // Run the test, check for expected match/don't match result.
4160 //
4161 RegexMatcher *testMat = testPat->matcher(matchString, status);
4162 UBool found = testMat->find();
4163 UBool expected = FALSE;
4164 if (fields[2].indexOf(UChar_y) >=0) {
4165 expected = TRUE;
4166 }
4167 if (expected != found) {
4168 errln("line %d: Expected %smatch, got %smatch",
4169 lineNum, expected?"":"no ", found?"":"no " );
4170 continue;
4171 }
4172
4173 // Don't try to check expected results if there is no match.
4174 // (Some have stuff in the expected fields)
4175 if (!found) {
4176 delete testMat;
4177 delete testPat;
4178 continue;
4179 }
4180
4181 //
4182 // Interpret the Perl expression from the fourth field of the data file,
4183 // building up an ICU string from the results of the ICU match.
4184 // The Perl expression will contain references to the results of
4185 // a regex match, including the matched string, capture group strings,
4186 // group starting and ending indicies, etc.
4187 //
4188 UnicodeString resultString;
4189 UnicodeString perlExpr = fields[3];
4190 #if SUPPORT_MUTATING_INPUT_STRING
4191 groupsMat->reset(perlExpr);
4192 cgMat->reset(perlExpr);
4193 #endif
4194
4195 while (perlExpr.length() > 0) {
4196 #if !SUPPORT_MUTATING_INPUT_STRING
4197 // Perferred usage. Reset after any modification to input string.
4198 groupsMat->reset(perlExpr);
4199 cgMat->reset(perlExpr);
4200 #endif
4201
4202 if (perlExpr.startsWith("$&")) {
4203 resultString.append(testMat->group(status));
4204 perlExpr.remove(0, 2);
4205 }
4206
4207 else if (groupsMat->lookingAt(status)) {
4208 // $-[0] $+[2] etc.
4209 UnicodeString digitString = groupsMat->group(2, status);
4210 int32_t t = 0;
4211 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4212 UnicodeString plusOrMinus = groupsMat->group(1, status);
4213 int32_t matchPosition;
4214 if (plusOrMinus.compare("+") == 0) {
4215 matchPosition = testMat->end(groupNum, status);
4216 } else {
4217 matchPosition = testMat->start(groupNum, status);
4218 }
4219 if (matchPosition != -1) {
4220 ICU_Utility::appendNumber(resultString, matchPosition);
4221 }
4222 perlExpr.remove(0, groupsMat->end(status));
4223 }
4224
4225 else if (cgMat->lookingAt(status)) {
4226 // $1, $2, $3, etc.
4227 UnicodeString digitString = cgMat->group(1, status);
4228 int32_t t = 0;
4229 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4230 if (U_SUCCESS(status)) {
4231 resultString.append(testMat->group(groupNum, status));
4232 status = U_ZERO_ERROR;
4233 }
4234 perlExpr.remove(0, cgMat->end(status));
4235 }
4236
4237 else if (perlExpr.startsWith("@-")) {
4238 int32_t i;
4239 for (i=0; i<=testMat->groupCount(); i++) {
4240 if (i>0) {
4241 resultString.append(" ");
4242 }
4243 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4244 }
4245 perlExpr.remove(0, 2);
4246 }
4247
4248 else if (perlExpr.startsWith("@+")) {
4249 int32_t i;
4250 for (i=0; i<=testMat->groupCount(); i++) {
4251 if (i>0) {
4252 resultString.append(" ");
4253 }
4254 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4255 }
4256 perlExpr.remove(0, 2);
4257 }
4258
4259 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4260 // or as an escaped sequence (e.g. \n)
4261 if (perlExpr.length() > 1) {
4262 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4263 }
4264 UChar c = perlExpr.charAt(0);
4265 switch (c) {
4266 case 'n': c = '\n'; break;
4267 // add any other escape sequences that show up in the test expected results.
4268 }
4269 resultString.append(c);
4270 perlExpr.remove(0, 1);
4271 }
4272
4273 else {
4274 // Any characters from the perl expression that we don't explicitly
4275 // recognize before here are assumed to be literals and copied
4276 // as-is to the expected results.
4277 resultString.append(perlExpr.charAt(0));
4278 perlExpr.remove(0, 1);
4279 }
4280
4281 if (U_FAILURE(status)) {
4282 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4283 break;
4284 }
4285 }
4286
4287 //
4288 // Expected Results Compare
4289 //
4290 UnicodeString expectedS(fields[4]);
4291 expectedS.findAndReplace(nulnulSrc, nulnul);
4292 expectedS.findAndReplace(ffffSrc, ffff);
4293 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4294
4295
4296 if (expectedS.compare(resultString) != 0) {
4297 err("Line %d: Incorrect perl expression results.", lineNum);
4298 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4299 }
4300
4301 delete testMat;
4302 delete testPat;
4303 }
4304
4305 //
4306 // All done. Clean up allocated stuff.
4307 //
4308 delete cgMat;
4309 delete cgPat;
4310
4311 delete groupsMat;
4312 delete groupsPat;
4313
4314 delete flagMat;
4315 delete flagPat;
4316
4317 delete lineMat;
4318 delete linePat;
4319
4320 delete fieldPat;
4321 delete [] testData;
4322
4323
4324 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4325
4326 }
4327
4328
4329 //-------------------------------------------------------------------------------
4330 //
4331 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4332 // (instead of using UnicodeStrings) to test the alternate engine.
4333 // The input file for this test is re_tests, the standard regular
4334 // expression test data distributed with the Perl source code.
4335 // See PerlTests() for more information.
4336 //
4337 //-------------------------------------------------------------------------------
4338 void RegexTest::PerlTestsUTF8() {
4339 char tdd[2048];
4340 const char *srcPath;
4341 UErrorCode status = U_ZERO_ERROR;
4342 UParseError pe;
4343 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4344 UText patternText = UTEXT_INITIALIZER;
4345 char *patternChars = NULL;
4346 int32_t patternLength;
4347 int32_t patternCapacity = 0;
4348 UText inputText = UTEXT_INITIALIZER;
4349 char *inputChars = NULL;
4350 int32_t inputLength;
4351 int32_t inputCapacity = 0;
4352
4353 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4354
4355 //
4356 // Open and read the test data file.
4357 //
4358 srcPath=getPath(tdd, "re_tests.txt");
4359 if(srcPath==NULL) {
4360 return; /* something went wrong, error already output */
4361 }
4362
4363 int32_t len;
4364 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4365 if (U_FAILURE(status)) {
4366 return; /* something went wrong, error already output */
4367 }
4368
4369 //
4370 // Put the test data into a UnicodeString
4371 //
4372 UnicodeString testDataString(FALSE, testData, len);
4373
4374 //
4375 // Regex to break the input file into lines, and strip the new lines.
4376 // One line per match, capture group one is the desired data.
4377 //
4378 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4379 if (U_FAILURE(status)) {
4380 dataerrln("RegexPattern::compile() error");
4381 return;
4382 }
4383 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4384
4385 //
4386 // Regex to split a test file line into fields.
4387 // There are six fields, separated by tabs.
4388 //
4389 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4390
4391 //
4392 // Regex to identify test patterns with flag settings, and to separate them.
4393 // Test patterns with flags look like 'pattern'i
4394 // Test patterns without flags are not quoted: pattern
4395 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4396 //
4397 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4398 RegexMatcher* flagMat = flagPat->matcher(status);
4399
4400 //
4401 // The Perl tests reference several perl-isms, which are evaluated/substituted
4402 // in the test data. Not being perl, this must be done explicitly. Here
4403 // are string constants and REs for these constructs.
4404 //
4405 UnicodeString nulnulSrc("${nulnul}");
4406 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4407 nulnul = nulnul.unescape();
4408
4409 UnicodeString ffffSrc("${ffff}");
4410 UnicodeString ffff("\\uffff", -1, US_INV);
4411 ffff = ffff.unescape();
4412
4413 // regexp for $-[0], $+[2], etc.
4414 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4415 RegexMatcher *groupsMat = groupsPat->matcher(status);
4416
4417 // regexp for $0, $1, $2, etc.
4418 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4419 RegexMatcher *cgMat = cgPat->matcher(status);
4420
4421
4422 //
4423 // Main Loop for the Perl Tests, runs once per line from the
4424 // test data file.
4425 //
4426 int32_t lineNum = 0;
4427 int32_t skippedUnimplementedCount = 0;
4428 while (lineMat->find()) {
4429 lineNum++;
4430
4431 //
4432 // Get a line, break it into its fields, do the Perl
4433 // variable substitutions.
4434 //
4435 UnicodeString line = lineMat->group(1, status);
4436 UnicodeString fields[7];
4437 fieldPat->split(line, fields, 7, status);
4438
4439 flagMat->reset(fields[0]);
4440 flagMat->matches(status);
4441 UnicodeString pattern = flagMat->group(2, status);
4442 pattern.findAndReplace("${bang}", "!");
4443 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4444 pattern.findAndReplace(ffffSrc, ffff);
4445
4446 //
4447 // Identify patterns that include match flag settings,
4448 // split off the flags, remove the extra quotes.
4449 //
4450 UnicodeString flagStr = flagMat->group(3, status);
4451 if (U_FAILURE(status)) {
4452 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4453 return;
4454 }
4455 int32_t flags = 0;
4456 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4457 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4458 const UChar UChar_m = 0x6d;
4459 const UChar UChar_x = 0x78;
4460 const UChar UChar_y = 0x79;
4461 if (flagStr.indexOf(UChar_i) != -1) {
4462 flags |= UREGEX_CASE_INSENSITIVE;
4463 }
4464 if (flagStr.indexOf(UChar_m) != -1) {
4465 flags |= UREGEX_MULTILINE;
4466 }
4467 if (flagStr.indexOf(UChar_x) != -1) {
4468 flags |= UREGEX_COMMENTS;
4469 }
4470
4471 //
4472 // Put the pattern in a UTF-8 UText
4473 //
4474 status = U_ZERO_ERROR;
4475 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4476 if (status == U_BUFFER_OVERFLOW_ERROR) {
4477 status = U_ZERO_ERROR;
4478 delete[] patternChars;
4479 patternCapacity = patternLength + 1;
4480 patternChars = new char[patternCapacity];
4481 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4482 }
4483 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4484
4485 //
4486 // Compile the test pattern.
4487 //
4488 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4489 if (status == U_REGEX_UNIMPLEMENTED) {
4490 //
4491 // Test of a feature that is planned for ICU, but not yet implemented.
4492 // skip the test.
4493 skippedUnimplementedCount++;
4494 delete testPat;
4495 status = U_ZERO_ERROR;
4496 continue;
4497 }
4498
4499 if (U_FAILURE(status)) {
4500 // Some tests are supposed to generate errors.
4501 // Only report an error for tests that are supposed to succeed.
4502 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4503 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4504 {
4505 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4506 }
4507 status = U_ZERO_ERROR;
4508 delete testPat;
4509 continue;
4510 }
4511
4512 if (fields[2].indexOf(UChar_i) >= 0) {
4513 // ICU should skip this test.
4514 delete testPat;
4515 continue;
4516 }
4517
4518 if (fields[2].indexOf(UChar_c) >= 0) {
4519 // This pattern should have caused a compilation error, but didn't/
4520 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4521 delete testPat;
4522 continue;
4523 }
4524
4525
4526 //
4527 // replace the Perl variables that appear in some of the
4528 // match data strings.
4529 //
4530 UnicodeString matchString = fields[1];
4531 matchString.findAndReplace(nulnulSrc, nulnul);
4532 matchString.findAndReplace(ffffSrc, ffff);
4533
4534 // Replace any \n in the match string with an actual new-line char.
4535 // Don't do full unescape, as this unescapes more than Perl does, which
4536 // causes other spurious failures in the tests.
4537 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4538
4539 //
4540 // Put the input in a UTF-8 UText
4541 //
4542 status = U_ZERO_ERROR;
4543 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4544 if (status == U_BUFFER_OVERFLOW_ERROR) {
4545 status = U_ZERO_ERROR;
4546 delete[] inputChars;
4547 inputCapacity = inputLength + 1;
4548 inputChars = new char[inputCapacity];
4549 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4550 }
4551 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4552
4553 //
4554 // Run the test, check for expected match/don't match result.
4555 //
4556 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4557 UBool found = testMat->find();
4558 UBool expected = FALSE;
4559 if (fields[2].indexOf(UChar_y) >=0) {
4560 expected = TRUE;
4561 }
4562 if (expected != found) {
4563 errln("line %d: Expected %smatch, got %smatch",
4564 lineNum, expected?"":"no ", found?"":"no " );
4565 continue;
4566 }
4567
4568 // Don't try to check expected results if there is no match.
4569 // (Some have stuff in the expected fields)
4570 if (!found) {
4571 delete testMat;
4572 delete testPat;
4573 continue;
4574 }
4575
4576 //
4577 // Interpret the Perl expression from the fourth field of the data file,
4578 // building up an ICU string from the results of the ICU match.
4579 // The Perl expression will contain references to the results of
4580 // a regex match, including the matched string, capture group strings,
4581 // group starting and ending indicies, etc.
4582 //
4583 UnicodeString resultString;
4584 UnicodeString perlExpr = fields[3];
4585
4586 while (perlExpr.length() > 0) {
4587 groupsMat->reset(perlExpr);
4588 cgMat->reset(perlExpr);
4589
4590 if (perlExpr.startsWith("$&")) {
4591 resultString.append(testMat->group(status));
4592 perlExpr.remove(0, 2);
4593 }
4594
4595 else if (groupsMat->lookingAt(status)) {
4596 // $-[0] $+[2] etc.
4597 UnicodeString digitString = groupsMat->group(2, status);
4598 int32_t t = 0;
4599 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4600 UnicodeString plusOrMinus = groupsMat->group(1, status);
4601 int32_t matchPosition;
4602 if (plusOrMinus.compare("+") == 0) {
4603 matchPosition = testMat->end(groupNum, status);
4604 } else {
4605 matchPosition = testMat->start(groupNum, status);
4606 }
4607 if (matchPosition != -1) {
4608 ICU_Utility::appendNumber(resultString, matchPosition);
4609 }
4610 perlExpr.remove(0, groupsMat->end(status));
4611 }
4612
4613 else if (cgMat->lookingAt(status)) {
4614 // $1, $2, $3, etc.
4615 UnicodeString digitString = cgMat->group(1, status);
4616 int32_t t = 0;
4617 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4618 if (U_SUCCESS(status)) {
4619 resultString.append(testMat->group(groupNum, status));
4620 status = U_ZERO_ERROR;
4621 }
4622 perlExpr.remove(0, cgMat->end(status));
4623 }
4624
4625 else if (perlExpr.startsWith("@-")) {
4626 int32_t i;
4627 for (i=0; i<=testMat->groupCount(); i++) {
4628 if (i>0) {
4629 resultString.append(" ");
4630 }
4631 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4632 }
4633 perlExpr.remove(0, 2);
4634 }
4635
4636 else if (perlExpr.startsWith("@+")) {
4637 int32_t i;
4638 for (i=0; i<=testMat->groupCount(); i++) {
4639 if (i>0) {
4640 resultString.append(" ");
4641 }
4642 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4643 }
4644 perlExpr.remove(0, 2);
4645 }
4646
4647 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4648 // or as an escaped sequence (e.g. \n)
4649 if (perlExpr.length() > 1) {
4650 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4651 }
4652 UChar c = perlExpr.charAt(0);
4653 switch (c) {
4654 case 'n': c = '\n'; break;
4655 // add any other escape sequences that show up in the test expected results.
4656 }
4657 resultString.append(c);
4658 perlExpr.remove(0, 1);
4659 }
4660
4661 else {
4662 // Any characters from the perl expression that we don't explicitly
4663 // recognize before here are assumed to be literals and copied
4664 // as-is to the expected results.
4665 resultString.append(perlExpr.charAt(0));
4666 perlExpr.remove(0, 1);
4667 }
4668
4669 if (U_FAILURE(status)) {
4670 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4671 break;
4672 }
4673 }
4674
4675 //
4676 // Expected Results Compare
4677 //
4678 UnicodeString expectedS(fields[4]);
4679 expectedS.findAndReplace(nulnulSrc, nulnul);
4680 expectedS.findAndReplace(ffffSrc, ffff);
4681 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4682
4683
4684 if (expectedS.compare(resultString) != 0) {
4685 err("Line %d: Incorrect perl expression results.", lineNum);
4686 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4687 }
4688
4689 delete testMat;
4690 delete testPat;
4691 }
4692
4693 //
4694 // All done. Clean up allocated stuff.
4695 //
4696 delete cgMat;
4697 delete cgPat;
4698
4699 delete groupsMat;
4700 delete groupsPat;
4701
4702 delete flagMat;
4703 delete flagPat;
4704
4705 delete lineMat;
4706 delete linePat;
4707
4708 delete fieldPat;
4709 delete [] testData;
4710
4711 utext_close(&patternText);
4712 utext_close(&inputText);
4713
4714 delete [] patternChars;
4715 delete [] inputChars;
4716
4717
4718 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4719
4720 }
4721
4722
4723 //--------------------------------------------------------------
4724 //
4725 // Bug6149 Verify limits to heap expansion for backtrack stack.
4726 // Use this pattern,
4727 // "(a?){1,8000000}"
4728 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4729 // This test is likely to be fragile, as further optimizations stop
4730 // more cases of pointless looping in the match engine.
4731 //
4732 //---------------------------------------------------------------
4733 void RegexTest::Bug6149() {
4734 UnicodeString pattern("(a?){1,8000000}");
4735 UnicodeString s("xyz");
4736 uint32_t flags = 0;
4737 UErrorCode status = U_ZERO_ERROR;
4738
4739 RegexMatcher matcher(pattern, s, flags, status);
4740 UBool result = false;
4741 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4742 REGEX_ASSERT(result == FALSE);
4743 }
4744
4745
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4752
4753 struct callBackContext {
4754 RegexTest *test;
4755 int32_t maxCalls;
4756 int32_t numCalls;
4757 int32_t lastSteps;
4758 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4759 };
4760
4761 U_CDECL_BEGIN
4762 static UBool U_CALLCONV
4763 testCallBackFn(const void *context, int32_t steps) {
4764 callBackContext *info = (callBackContext *)context;
4765 if (info->lastSteps+1 != steps) {
4766 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4767 }
4768 info->lastSteps = steps;
4769 info->numCalls++;
4770 return (info->numCalls < info->maxCalls);
4771 }
4772 U_CDECL_END
4773
4774 void RegexTest::Callbacks() {
4775 {
4776 // Getter returns NULLs if no callback has been set
4777
4778 // The variables that the getter will fill in.
4779 // Init to non-null values so that the action of the getter can be seen.
4780 const void *returnedContext = &returnedContext;
4781 URegexMatchCallback *returnedFn = &testCallBackFn;
4782
4783 UErrorCode status = U_ZERO_ERROR;
4784 RegexMatcher matcher("x", 0, status);
4785 REGEX_CHECK_STATUS;
4786 matcher.getMatchCallback(returnedFn, returnedContext, status);
4787 REGEX_CHECK_STATUS;
4788 REGEX_ASSERT(returnedFn == NULL);
4789 REGEX_ASSERT(returnedContext == NULL);
4790 }
4791
4792 {
4793 // Set and Get work
4794 callBackContext cbInfo = {this, 0, 0, 0};
4795 const void *returnedContext;
4796 URegexMatchCallback *returnedFn;
4797 UErrorCode status = U_ZERO_ERROR;
4798 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4799 REGEX_CHECK_STATUS;
4800 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4801 REGEX_CHECK_STATUS;
4802 matcher.getMatchCallback(returnedFn, returnedContext, status);
4803 REGEX_CHECK_STATUS;
4804 REGEX_ASSERT(returnedFn == testCallBackFn);
4805 REGEX_ASSERT(returnedContext == &cbInfo);
4806
4807 // A short-running match shouldn't invoke the callback
4808 status = U_ZERO_ERROR;
4809 cbInfo.reset(1);
4810 UnicodeString s = "xxx";
4811 matcher.reset(s);
4812 REGEX_ASSERT(matcher.matches(status));
4813 REGEX_CHECK_STATUS;
4814 REGEX_ASSERT(cbInfo.numCalls == 0);
4815
4816 // A medium-length match that runs long enough to invoke the
4817 // callback, but not so long that the callback aborts it.
4818 status = U_ZERO_ERROR;
4819 cbInfo.reset(4);
4820 s = "aaaaaaaaaaaaaaaaaaab";
4821 matcher.reset(s);
4822 REGEX_ASSERT(matcher.matches(status)==FALSE);
4823 REGEX_CHECK_STATUS;
4824 REGEX_ASSERT(cbInfo.numCalls > 0);
4825
4826 // A longer running match that the callback function will abort.
4827 status = U_ZERO_ERROR;
4828 cbInfo.reset(4);
4829 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4830 matcher.reset(s);
4831 REGEX_ASSERT(matcher.matches(status)==FALSE);
4832 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4833 REGEX_ASSERT(cbInfo.numCalls == 4);
4834
4835 // A longer running find that the callback function will abort.
4836 status = U_ZERO_ERROR;
4837 cbInfo.reset(4);
4838 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4839 matcher.reset(s);
4840 REGEX_ASSERT(matcher.find(status)==FALSE);
4841 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4842 REGEX_ASSERT(cbInfo.numCalls == 4);
4843 }
4844
4845
4846 }
4847
4848
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4855
4856 struct progressCallBackContext {
4857 RegexTest *test;
4858 int64_t lastIndex;
4859 int32_t maxCalls;
4860 int32_t numCalls;
4861 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4862 };
4863
4864 // call-back function for find().
4865 // Return TRUE to continue the find().
4866 // Return FALSE to stop the find().
4867 U_CDECL_BEGIN
4868 static UBool U_CALLCONV
4869 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4870 progressCallBackContext *info = (progressCallBackContext *)context;
4871 info->numCalls++;
4872 info->lastIndex = matchIndex;
4873 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4874 return (info->numCalls < info->maxCalls);
4875 }
4876 U_CDECL_END
4877
4878 void RegexTest::FindProgressCallbacks() {
4879 {
4880 // Getter returns NULLs if no callback has been set
4881
4882 // The variables that the getter will fill in.
4883 // Init to non-null values so that the action of the getter can be seen.
4884 const void *returnedContext = &returnedContext;
4885 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4886
4887 UErrorCode status = U_ZERO_ERROR;
4888 RegexMatcher matcher("x", 0, status);
4889 REGEX_CHECK_STATUS;
4890 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4891 REGEX_CHECK_STATUS;
4892 REGEX_ASSERT(returnedFn == NULL);
4893 REGEX_ASSERT(returnedContext == NULL);
4894 }
4895
4896 {
4897 // Set and Get work
4898 progressCallBackContext cbInfo = {this, 0, 0, 0};
4899 const void *returnedContext;
4900 URegexFindProgressCallback *returnedFn;
4901 UErrorCode status = U_ZERO_ERROR;
4902 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4903 REGEX_CHECK_STATUS;
4904 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4905 REGEX_CHECK_STATUS;
4906 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4907 REGEX_CHECK_STATUS;
4908 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4909 REGEX_ASSERT(returnedContext == &cbInfo);
4910
4911 // A find that matches on the initial position does NOT invoke the callback.
4912 status = U_ZERO_ERROR;
4913 cbInfo.reset(100);
4914 UnicodeString s = "aaxxx";
4915 matcher.reset(s);
4916 #if 0
4917 matcher.setTrace(TRUE);
4918 #endif
4919 REGEX_ASSERT(matcher.find(0, status));
4920 REGEX_CHECK_STATUS;
4921 REGEX_ASSERT(cbInfo.numCalls == 0);
4922
4923 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4924 // but not so many times that we interrupt the operation.
4925 status = U_ZERO_ERROR;
4926 s = "aaaaaaaaaaaaaaaaaaab";
4927 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4928 matcher.reset(s);
4929 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4930 REGEX_CHECK_STATUS;
4931 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4932
4933 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4934 status = U_ZERO_ERROR;
4935 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4936 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4937 matcher.reset(s1);
4938 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4939 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4940 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4941
4942 // Now a match that will succeed, but after an interruption
4943 status = U_ZERO_ERROR;
4944 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4945 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4946 matcher.reset(s2);
4947 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4948 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4949 // Now retry the match from where left off
4950 cbInfo.maxCalls = 100; // No callback limit
4951 status = U_ZERO_ERROR;
4952 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4953 REGEX_CHECK_STATUS;
4954 }
4955
4956
4957 }
4958
4959
4960 //---------------------------------------------------------------------------
4961 //
4962 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4963 // UTexts. The pure-C implementation of UText
4964 // has no mutable backing stores, but we can
4965 // use UnicodeString here to test the functionality.
4966 //
4967 //---------------------------------------------------------------------------
4968 void RegexTest::PreAllocatedUTextCAPI () {
4969 UErrorCode status = U_ZERO_ERROR;
4970 URegularExpression *re;
4971 UText patternText = UTEXT_INITIALIZER;
4972 UnicodeString buffer;
4973 UText bufferText = UTEXT_INITIALIZER;
4974
4975 utext_openUnicodeString(&bufferText, &buffer, &status);
4976
4977 /*
4978 * getText() and getUText()
4979 */
4980 {
4981 UText text1 = UTEXT_INITIALIZER;
4982 UText text2 = UTEXT_INITIALIZER;
4983 UChar text2Chars[20];
4984 UText *resultText;
4985
4986 status = U_ZERO_ERROR;
4987 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4988 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4989 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4990 utext_openUChars(&text2, text2Chars, -1, &status);
4991
4992 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4993 re = uregex_openUText(&patternText, 0, NULL, &status);
4994
4995 /* First set a UText */
4996 uregex_setUText(re, &text1, &status);
4997 resultText = uregex_getUText(re, &bufferText, &status);
4998 REGEX_CHECK_STATUS;
4999 REGEX_ASSERT(resultText == &bufferText);
5000 utext_setNativeIndex(resultText, 0);
5001 utext_setNativeIndex(&text1, 0);
5002 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5003
5004 resultText = uregex_getUText(re, &bufferText, &status);
5005 REGEX_CHECK_STATUS;
5006 REGEX_ASSERT(resultText == &bufferText);
5007 utext_setNativeIndex(resultText, 0);
5008 utext_setNativeIndex(&text1, 0);
5009 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5010
5011 /* Then set a UChar * */
5012 uregex_setText(re, text2Chars, 7, &status);
5013 resultText = uregex_getUText(re, &bufferText, &status);
5014 REGEX_CHECK_STATUS;
5015 REGEX_ASSERT(resultText == &bufferText);
5016 utext_setNativeIndex(resultText, 0);
5017 utext_setNativeIndex(&text2, 0);
5018 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5019
5020 uregex_close(re);
5021 utext_close(&text1);
5022 utext_close(&text2);
5023 }
5024
5025 /*
5026 * group()
5027 */
5028 {
5029 UChar text1[80];
5030 UText *actual;
5031 UBool result;
5032 int64_t length = 0;
5033
5034 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5035 // 012345678901234567890123456789012345678901234567
5036 // 0 1 2 3 4
5037
5038 status = U_ZERO_ERROR;
5039 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5040 REGEX_CHECK_STATUS;
5041
5042 uregex_setText(re, text1, -1, &status);
5043 result = uregex_find(re, 0, &status);
5044 REGEX_ASSERT(result==TRUE);
5045
5046 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5047 status = U_ZERO_ERROR;
5048 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5049 REGEX_CHECK_STATUS;
5050 REGEX_ASSERT(actual == &bufferText);
5051 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5052 REGEX_ASSERT(length == 16);
5053 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5054
5055 /* Capture group #1. Should succeed, matching " interior ". */
5056 status = U_ZERO_ERROR;
5057 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5058 REGEX_CHECK_STATUS;
5059 REGEX_ASSERT(actual == &bufferText);
5060 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5061 REGEX_ASSERT(length == 10);
5062 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5063
5064 /* Capture group out of range. Error. */
5065 status = U_ZERO_ERROR;
5066 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5067 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5068 REGEX_ASSERT(actual == &bufferText);
5069 uregex_close(re);
5070
5071 }
5072
5073 /*
5074 * replaceFirst()
5075 */
5076 {
5077 UChar text1[80];
5078 UChar text2[80];
5079 UText replText = UTEXT_INITIALIZER;
5080 UText *result;
5081 status = U_ZERO_ERROR;
5082 utext_openUnicodeString(&bufferText, &buffer, &status);
5083
5084 status = U_ZERO_ERROR;
5085 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5086 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5087 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5088
5089 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5090 REGEX_CHECK_STATUS;
5091
5092 /* Normal case, with match */
5093 uregex_setText(re, text1, -1, &status);
5094 REGEX_CHECK_STATUS;
5095 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5096 REGEX_CHECK_STATUS;
5097 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5098 REGEX_CHECK_STATUS;
5099 REGEX_ASSERT(result == &bufferText);
5100 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5101
5102 /* No match. Text should copy to output with no changes. */
5103 uregex_setText(re, text2, -1, &status);
5104 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5105 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5106 REGEX_CHECK_STATUS;
5107 REGEX_ASSERT(result == &bufferText);
5108 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5109
5110 /* Unicode escapes */
5111 uregex_setText(re, text1, -1, &status);
5112 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5113 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5114 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5115 REGEX_CHECK_STATUS;
5116 REGEX_ASSERT(result == &bufferText);
5117 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5118
5119 uregex_close(re);
5120 utext_close(&replText);
5121 }
5122
5123
5124 /*
5125 * replaceAll()
5126 */
5127 {
5128 UChar text1[80];
5129 UChar text2[80];
5130 UText replText = UTEXT_INITIALIZER;
5131 UText *result;
5132
5133 status = U_ZERO_ERROR;
5134 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5135 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5136 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5137
5138 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5139 REGEX_CHECK_STATUS;
5140
5141 /* Normal case, with match */
5142 uregex_setText(re, text1, -1, &status);
5143 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5144 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5145 REGEX_CHECK_STATUS;
5146 REGEX_ASSERT(result == &bufferText);
5147 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5148
5149 /* No match. Text should copy to output with no changes. */
5150 uregex_setText(re, text2, -1, &status);
5151 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5152 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5153 REGEX_CHECK_STATUS;
5154 REGEX_ASSERT(result == &bufferText);
5155 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5156
5157 uregex_close(re);
5158 utext_close(&replText);
5159 }
5160
5161
5162 /*
5163 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5164 * so we don't need to test it here.
5165 */
5166
5167 utext_close(&bufferText);
5168 utext_close(&patternText);
5169 }
5170
5171
5172 //--------------------------------------------------------------
5173 //
5174 // NamedCapture Check basic named capture group functionality
5175 //
5176 //--------------------------------------------------------------
5177 void RegexTest::NamedCapture() {
5178 UErrorCode status = U_ZERO_ERROR;
5179 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5180 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5181 REGEX_CHECK_STATUS;
5182 int32_t group = pat->groupNumberFromName("five", -1, status);
5183 REGEX_CHECK_STATUS;
5184 REGEX_ASSERT(5 == group);
5185 group = pat->groupNumberFromName("three", -1, status);
5186 REGEX_CHECK_STATUS;
5187 REGEX_ASSERT(3 == group);
5188
5189 status = U_ZERO_ERROR;
5190 group = pat->groupNumberFromName(UnicodeString("six"), status);
5191 REGEX_CHECK_STATUS;
5192 REGEX_ASSERT(6 == group);
5193
5194 status = U_ZERO_ERROR;
5195 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5196 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5197
5198 status = U_ZERO_ERROR;
5199
5200 // After copying a pattern, named capture should still work in the copy.
5201 RegexPattern *copiedPat = new RegexPattern(*pat);
5202 REGEX_ASSERT(*copiedPat == *pat);
5203 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5204
5205 group = copiedPat->groupNumberFromName("five", -1, status);
5206 REGEX_CHECK_STATUS;
5207 REGEX_ASSERT(5 == group);
5208 group = copiedPat->groupNumberFromName("three", -1, status);
5209 REGEX_CHECK_STATUS;
5210 REGEX_ASSERT(3 == group);
5211 delete copiedPat;
5212
5213 // ReplaceAll with named capture group.
5214 status = U_ZERO_ERROR;
5215 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5216 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5217 REGEX_CHECK_STATUS;
5218 // m.pattern().dumpPattern();
5219 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5220 REGEX_CHECK_STATUS;
5221 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5222 delete m;
5223
5224 // ReplaceAll, allowed capture group numbers.
5225 text = UnicodeString("abcmxyz");
5226 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5227 REGEX_CHECK_STATUS;
5228
5229 status = U_ZERO_ERROR;
5230 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5231 REGEX_CHECK_STATUS;
5232 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5233
5234 status = U_ZERO_ERROR;
5235 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5236 REGEX_CHECK_STATUS;
5237 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5238
5239 status = U_ZERO_ERROR;
5240 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5241 REGEX_CHECK_STATUS;
5242 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5243
5244 status = U_ZERO_ERROR;
5245 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5246 REGEX_CHECK_STATUS;
5247 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5248
5249 status = U_ZERO_ERROR;
5250 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5251 REGEX_CHECK_STATUS;
5252 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5253
5254 status = U_ZERO_ERROR;
5255 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5256 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5257
5258 status = U_ZERO_ERROR;
5259 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5260 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5261 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5262
5263 status = U_ZERO_ERROR;
5264 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5265 REGEX_CHECK_STATUS; // that push group num out of range.
5266 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5267
5268 status = U_ZERO_ERROR;
5269 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5270 REGEX_CHECK_STATUS;
5271 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5272
5273 status = U_ZERO_ERROR;
5274 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5275 REGEX_CHECK_STATUS;
5276 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5277
5278 status = U_ZERO_ERROR;
5279 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5280 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5281
5282 status = U_ZERO_ERROR;
5283 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5284 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5285
5286 status = U_ZERO_ERROR;
5287 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5288 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5289
5290 status = U_ZERO_ERROR;
5291 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5292 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5293
5294 delete m;
5295
5296 // Repeat the above replaceAll() tests using the plain C API, which
5297 // has a separate implementation internally.
5298 // TODO: factor out the test data.
5299
5300 status = U_ZERO_ERROR;
5301 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5302 REGEX_CHECK_STATUS;
5303 text = UnicodeString("abcmxyz");
5304 uregex_setText(re, text.getBuffer(), text.length(), &status);
5305 REGEX_CHECK_STATUS;
5306
5307 UChar resultBuf[100];
5308 int32_t resultLength;
5309 UnicodeString repl;
5310
5311 status = U_ZERO_ERROR;
5312 repl = UnicodeString("<$0>");
5313 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5314 REGEX_CHECK_STATUS;
5315 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5316
5317 status = U_ZERO_ERROR;
5318 repl = UnicodeString("<$1>");
5319 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5320 REGEX_CHECK_STATUS;
5321 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5322
5323 status = U_ZERO_ERROR;
5324 repl = UnicodeString("<${one}>");
5325 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5326 REGEX_CHECK_STATUS;
5327 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5328
5329 status = U_ZERO_ERROR;
5330 repl = UnicodeString("<$2>");
5331 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5332 REGEX_CHECK_STATUS;
5333 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5334
5335 status = U_ZERO_ERROR;
5336 repl = UnicodeString("<$3>");
5337 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5338 REGEX_CHECK_STATUS;
5339 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5340
5341 status = U_ZERO_ERROR;
5342 repl = UnicodeString("<$4>");
5343 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5344 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5345
5346 status = U_ZERO_ERROR;
5347 repl = UnicodeString("<$04>");
5348 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5349 REGEX_CHECK_STATUS;
5350 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5351
5352 status = U_ZERO_ERROR;
5353 repl = UnicodeString("<$000016>");
5354 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5355 REGEX_CHECK_STATUS;
5356 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5357
5358 status = U_ZERO_ERROR;
5359 repl = UnicodeString("<$3$2$1${one}>");
5360 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5361 REGEX_CHECK_STATUS;
5362 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5363
5364 status = U_ZERO_ERROR;
5365 repl = UnicodeString("$3$2$1${one}");
5366 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5367 REGEX_CHECK_STATUS;
5368 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5369
5370 status = U_ZERO_ERROR;
5371 repl = UnicodeString("<${noSuchName}>");
5372 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5373 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5374
5375 status = U_ZERO_ERROR;
5376 repl = UnicodeString("<${invalid-name}>");
5377 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5378 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5379
5380 status = U_ZERO_ERROR;
5381 repl = UnicodeString("<${one");
5382 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5384
5385 status = U_ZERO_ERROR;
5386 repl = UnicodeString("$not a capture group");
5387 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5388 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5389
5390 uregex_close(re);
5391 }
5392
5393 //--------------------------------------------------------------
5394 //
5395 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5396 // The point is not so much what the exact limit is,
5397 // but that a largish number doesn't hit bad non-linear performance,
5398 // and that exceeding the limit fails cleanly.
5399 //
5400 //--------------------------------------------------------------
5401 void RegexTest::NamedCaptureLimits() {
5402 if (quick) {
5403 logln("Skipping test. Runs in exhuastive mode only.");
5404 return;
5405 }
5406 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5407 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5408 char nnbuf[100];
5409 UnicodeString pattern;
5410 int32_t nn;
5411
5412 for (nn=1; nn<goodLimit; nn++) {
5413 sprintf(nnbuf, "(?<nn%d>)", nn);
5414 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5415 }
5416 UErrorCode status = U_ZERO_ERROR;
5417 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5418 REGEX_CHECK_STATUS;
5419 for (nn=1; nn<goodLimit; nn++) {
5420 sprintf(nnbuf, "nn%d", nn);
5421 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5422 REGEX_ASSERT(nn == groupNum);
5423 if (nn != groupNum) {
5424 break;
5425 }
5426 }
5427 delete pat;
5428
5429 pattern.remove();
5430 for (nn=1; nn<failLimit; nn++) {
5431 sprintf(nnbuf, "(?<nn%d>)", nn);
5432 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5433 }
5434 status = U_ZERO_ERROR;
5435 pat = RegexPattern::compile(pattern, 0, status);
5436 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5437 delete pat;
5438 }
5439
5440
5441 //--------------------------------------------------------------
5442 //
5443 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5444 //
5445 //---------------------------------------------------------------
5446 void RegexTest::Bug7651() {
5447 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5448 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5449 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5450 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5451 UnicodeString s("#ff @abcd This is test");
5452 RegexPattern *REPattern = NULL;
5453 RegexMatcher *REMatcher = NULL;
5454 UErrorCode status = U_ZERO_ERROR;
5455 UParseError pe;
5456
5457 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5458 REGEX_CHECK_STATUS;
5459 REMatcher = REPattern->matcher(s, status);
5460 REGEX_CHECK_STATUS;
5461 REGEX_ASSERT(REMatcher->find());
5462 REGEX_ASSERT(REMatcher->start(status) == 0);
5463 delete REPattern;
5464 delete REMatcher;
5465 status = U_ZERO_ERROR;
5466
5467 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5468 REGEX_CHECK_STATUS;
5469 REMatcher = REPattern->matcher(s, status);
5470 REGEX_CHECK_STATUS;
5471 REGEX_ASSERT(REMatcher->find());
5472 REGEX_ASSERT(REMatcher->start(status) == 0);
5473 delete REPattern;
5474 delete REMatcher;
5475 status = U_ZERO_ERROR;
5476 }
5477
5478 void RegexTest::Bug7740() {
5479 UErrorCode status = U_ZERO_ERROR;
5480 UnicodeString pattern = "(a)";
5481 UnicodeString text = "abcdef";
5482 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5483 REGEX_CHECK_STATUS;
5484 REGEX_ASSERT(m->lookingAt(status));
5485 REGEX_CHECK_STATUS;
5486 status = U_ILLEGAL_ARGUMENT_ERROR;
5487 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5488 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5489 REGEX_ASSERT(s == "");
5490 delete m;
5491 }
5492
5493 // Bug 8479: was crashing whith a Bogus UnicodeString as input.
5494
5495 void RegexTest::Bug8479() {
5496 UErrorCode status = U_ZERO_ERROR;
5497
5498 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5499 REGEX_CHECK_STATUS;
5500 if (U_SUCCESS(status))
5501 {
5502 UnicodeString str;
5503 str.setToBogus();
5504 pMatcher->reset(str);
5505 status = U_ZERO_ERROR;
5506 pMatcher->matches(status);
5507 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5508 delete pMatcher;
5509 }
5510 }
5511
5512
5513 // Bug 7029
5514 void RegexTest::Bug7029() {
5515 UErrorCode status = U_ZERO_ERROR;
5516
5517 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5518 UnicodeString text = "abc.def";
5519 UnicodeString splits[10];
5520 REGEX_CHECK_STATUS;
5521 int32_t numFields = pMatcher->split(text, splits, 10, status);
5522 REGEX_CHECK_STATUS;
5523 REGEX_ASSERT(numFields == 8);
5524 delete pMatcher;
5525 }
5526
5527 // Bug 9283
5528 // This test is checking for the existance of any supplemental characters that case-fold
5529 // to a bmp character.
5530 //
5531 // At the time of this writing there are none. If any should appear in a subsequent release
5532 // of Unicode, the code in regular expressions compilation that determines the longest
5533 // posssible match for a literal string will need to be enhanced.
5534 //
5535 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5536 // for details on what to do in case of a failure of this test.
5537 //
5538 void RegexTest::Bug9283() {
5539 #if !UCONFIG_NO_NORMALIZATION
5540 UErrorCode status = U_ZERO_ERROR;
5541 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5542 REGEX_CHECK_STATUS;
5543 int32_t index;
5544 UChar32 c;
5545 for (index=0; ; index++) {
5546 c = supplementalsWithCaseFolding.charAt(index);
5547 if (c == -1) {
5548 break;
5549 }
5550 UnicodeString cf = UnicodeString(c).foldCase();
5551 REGEX_ASSERT(cf.length() >= 2);
5552 }
5553 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5554 }
5555
5556
5557 void RegexTest::CheckInvBufSize() {
5558 if(inv_next>=INV_BUFSIZ) {
5559 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5560 __FILE__, INV_BUFSIZ, inv_next);
5561 } else {
5562 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5563 }
5564 }
5565
5566
5567 void RegexTest::Bug10459() {
5568 UErrorCode status = U_ZERO_ERROR;
5569 UnicodeString patternString("(txt)");
5570 UnicodeString txtString("txt");
5571
5572 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5573 REGEX_CHECK_STATUS;
5574 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5575 REGEX_CHECK_STATUS;
5576
5577 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5578 REGEX_CHECK_STATUS;
5579
5580 uregex_setUText(icu_re, utext_txt, &status);
5581 REGEX_CHECK_STATUS;
5582
5583 // The bug was that calling uregex_group() before doing a matching operation
5584 // was causing a segfault. Only for Regular Expressions created from UText.
5585 // It should set an U_REGEX_INVALID_STATE.
5586
5587 UChar buf[100];
5588 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5589 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5590 REGEX_ASSERT(len == 0);
5591
5592 uregex_close(icu_re);
5593 utext_close(utext_pat);
5594 utext_close(utext_txt);
5595 }
5596
5597 void RegexTest::TestCaseInsensitiveStarters() {
5598 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5599 // become stale because of new Unicode characters.
5600 // If it is stale, rerun the generation tool
5601 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5602 // and replace the embedded data in i18n/regexcmp.cpp
5603
5604 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5605 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5606 continue;
5607 }
5608 UnicodeSet s(cp, cp);
5609 s.closeOver(USET_CASE_INSENSITIVE);
5610 UnicodeSetIterator setIter(s);
5611 while (setIter.next()) {
5612 if (!setIter.isString()) {
5613 continue;
5614 }
5615 const UnicodeString &str = setIter.getString();
5616 UChar32 firstChar = str.char32At(0);
5617 UnicodeSet starters;
5618 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5619 if (!starters.contains(cp)) {
5620 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5621 return;
5622 }
5623 }
5624 }
5625 }
5626
5627
5628 void RegexTest::TestBug11049() {
5629 // Original bug report: pattern with match start consisting of one of several individual characters,
5630 // and the text being matched ending with a supplementary character. find() would read past the
5631 // end of the input text when searching for potential match starting points.
5632
5633 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5634 // detect the bad read.
5635
5636 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5637 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5638
5639 // Test again with a pattern starting with a single character,
5640 // which takes a different code path than starting with an OR expression,
5641 // but with similar logic.
5642 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5643 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5644 }
5645
5646 // Run a single test case from TestBug11049(). Internal function.
5647 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5648 UErrorCode status = U_ZERO_ERROR;
5649 UnicodeString patternString = UnicodeString(pattern).unescape();
5650 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5651
5652 UnicodeString dataString = UnicodeString(data).unescape();
5653 UChar *exactBuffer = new UChar[dataString.length()];
5654 dataString.extract(exactBuffer, dataString.length(), status);
5655 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5656
5657 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5658 REGEX_CHECK_STATUS;
5659 matcher->reset(ut);
5660 UBool result = matcher->find();
5661 if (result != expectMatch) {
5662 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5663 __FILE__, lineNumber, expectMatch, result, pattern, data);
5664 }
5665
5666 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5667 // off-by-one on find() with match at the last code point.
5668 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5669 // because string.unescape() will only shrink it.
5670 char * utf8Buffer = new char[uprv_strlen(data)+1];
5671 u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5672 REGEX_CHECK_STATUS;
5673 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5674 REGEX_CHECK_STATUS;
5675 matcher->reset(ut);
5676 result = matcher->find();
5677 if (result != expectMatch) {
5678 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5679 __FILE__, lineNumber, expectMatch, result, pattern, data);
5680 }
5681 delete [] utf8Buffer;
5682
5683 utext_close(ut);
5684 delete [] exactBuffer;
5685 }
5686
5687
5688 void RegexTest::TestBug11371() {
5689 if (quick) {
5690 logln("Skipping test. Runs in exhuastive mode only.");
5691 return;
5692 }
5693 UErrorCode status = U_ZERO_ERROR;
5694 UnicodeString patternString;
5695
5696 for (int i=0; i<8000000; i++) {
5697 patternString.append(UnicodeString("()"));
5698 }
5699 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5700 if (status != U_REGEX_PATTERN_TOO_BIG) {
5701 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5702 __FILE__, __LINE__, u_errorName(status));
5703 }
5704
5705 status = U_ZERO_ERROR;
5706 patternString = "(";
5707 for (int i=0; i<20000000; i++) {
5708 patternString.append(UnicodeString("A++"));
5709 }
5710 patternString.append(UnicodeString("){0}B++"));
5711 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5712 if (status != U_REGEX_PATTERN_TOO_BIG) {
5713 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5714 __FILE__, __LINE__, u_errorName(status));
5715 }
5716
5717 // Pattern with too much string data, such that string indexes overflow operand data field size
5718 // in compiled instruction.
5719 status = U_ZERO_ERROR;
5720 patternString = "";
5721 while (patternString.length() < 0x00ffffff) {
5722 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5723 }
5724 patternString.append(UnicodeString("X? trailing string"));
5725 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5726 if (status != U_REGEX_PATTERN_TOO_BIG) {
5727 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5728 __FILE__, __LINE__, u_errorName(status));
5729 }
5730 }
5731
5732 void RegexTest::TestBug11480() {
5733 // C API, get capture group of a group that does not participate in the match.
5734 // (Returns a zero length string, with nul termination,
5735 // indistinguishable from a group with a zero length match.)
5736
5737 UErrorCode status = U_ZERO_ERROR;
5738 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5739 REGEX_CHECK_STATUS;
5740 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5741 uregex_setText(re, text.getBuffer(), text.length(), &status);
5742 REGEX_CHECK_STATUS;
5743 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5744 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5745 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5746 REGEX_ASSERT(length == 0);
5747 REGEX_ASSERT(buf[0] == 13);
5748 REGEX_ASSERT(buf[1] == 0);
5749 REGEX_ASSERT(buf[2] == 13);
5750 uregex_close(re);
5751
5752 // UText C++ API, length of match is 0 for non-participating matches.
5753 UText ut = UTEXT_INITIALIZER;
5754 utext_openUnicodeString(&ut, &text, &status);
5755 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5756 REGEX_CHECK_STATUS;
5757 matcher.reset(&ut);
5758 REGEX_ASSERT(matcher.lookingAt(0, status));
5759
5760 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5761 int64_t groupLen = -666;
5762 UText group = UTEXT_INITIALIZER;
5763 matcher.group(1, &group, groupLen, status);
5764 REGEX_CHECK_STATUS;
5765 REGEX_ASSERT(groupLen == 1);
5766 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5767
5768 // Capture group 2, the (B), does not participate in the match.
5769 matcher.group(2, &group, groupLen, status);
5770 REGEX_CHECK_STATUS;
5771 REGEX_ASSERT(groupLen == 0);
5772 REGEX_ASSERT(matcher.start(2, status) == -1);
5773 REGEX_CHECK_STATUS;
5774 }
5775
5776 void RegexTest::TestBug12884() {
5777 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5778 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5779 UnicodeString text(u"hello");
5780 UErrorCode status = U_ZERO_ERROR;
5781 RegexMatcher m(pattern, text, 0, status);
5782 REGEX_CHECK_STATUS;
5783 m.setTimeLimit(5, status);
5784 m.find(status);
5785 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5786
5787 // Non-greedy loops. They take a different code path during matching.
5788 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5789 status = U_ZERO_ERROR;
5790 RegexMatcher ngM(ngPattern, text, 0, status);
5791 REGEX_CHECK_STATUS;
5792 ngM.setTimeLimit(5, status);
5793 ngM.find(status);
5794 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5795
5796 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5797 const char *text8 = u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5798 "carácter, sin importar la plataforma, sin importar el programa,"
5799 "sin importar el idioma.";
5800 status = U_ZERO_ERROR;
5801 LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5802 REGEX_CHECK_STATUS;
5803 m.reset(ut.getAlias());
5804 m.find(status);
5805 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5806
5807 status = U_ZERO_ERROR;
5808 ngM.reset(ut.getAlias());
5809 ngM.find(status);
5810 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5811 }
5812
5813 // Bug 13631. A find() of a pattern with a zero length look-behind assertions
5814 // can cause a read past the end of the input text.
5815 // The failure is seen when running this test with Clang's Addresss Sanitizer.
5816
5817 void RegexTest::TestBug13631() {
5818 const UChar *pats[] = { u"(?<!^)",
5819 u"(?<=^)",
5820 nullptr
5821 };
5822 for (const UChar **pat=pats; *pat; ++pat) {
5823 UErrorCode status = U_ZERO_ERROR;
5824 UnicodeString upat(*pat);
5825 RegexMatcher matcher(upat, 0, status);
5826 const UChar s =u'a';
5827 UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5828 REGEX_CHECK_STATUS;
5829 matcher.reset(ut);
5830 while (matcher.find()) {
5831 }
5832 utext_close(ut);
5833 }
5834 }
5835
5836 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5837 // where a following group specification would be expected.
5838 // Failure shows when running the test under Clang's Address Sanitizer.
5839
5840 void RegexTest::TestBug13632() {
5841 UErrorCode status = U_ZERO_ERROR;
5842 URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5843 const char16_t *sourceString = u"Hello, world.";
5844 uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5845
5846 const int32_t destCap = 20;
5847 char16_t dest[destCap] = {};
5848 const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
5849 uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5850
5851 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5852 uregex_close(re);
5853 }
5854
5855 void RegexTest::TestBug20359() {
5856 // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5857 // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5858 // Changed implementation to loop instead of recursing.
5859
5860 UnicodeString pattern;
5861 for (int i=0; i<50000; ++i) {
5862 pattern += u"\\Q\\E";
5863 }
5864 pattern += u"x";
5865
5866 UErrorCode status = U_ZERO_ERROR;
5867 LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5868 0, nullptr, &status));
5869 assertSuccess(WHERE, status);
5870
5871 // We have passed the point where the bug crashed. The following is a small sanity
5872 // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5873
5874 uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5875 assertSuccess(WHERE, status);
5876 assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5877 assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5878 assertSuccess(WHERE, status);
5879 }
5880
5881 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */