]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/regextst.cpp
ICU-551.51.4.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
... / ...
CommitLineData
1/********************************************************************
2 * COPYRIGHT:
3 * Copyright (c) 2002-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8// regextst.cpp
9//
10// ICU Regular Expressions test, part of intltest.
11//
12
13/*
14 NOTE!!
15
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
20
21 */
22
23#include "intltest.h"
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
26#include "unicode/localpointer.h"
27#include "unicode/regex.h"
28#include "unicode/uchar.h"
29#include "unicode/ucnv.h"
30#include "unicode/uniset.h"
31#include "unicode/uregex.h"
32#include "unicode/usetiter.h"
33#include "unicode/ustring.h"
34#include "regextst.h"
35#include "regexcmp.h"
36#include "uvector.h"
37#include "util.h"
38#include <stdlib.h>
39#include <string.h>
40#include <stdio.h>
41#include "cmemory.h"
42#include "cstring.h"
43#include "uinvchar.h"
44
45#define SUPPORT_MUTATING_INPUT_STRING 0
46
47//---------------------------------------------------------------------------
48//
49// Test class boilerplate
50//
51//---------------------------------------------------------------------------
52RegexTest::RegexTest()
53{
54}
55
56
57RegexTest::~RegexTest()
58{
59}
60
61
62
63void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
64{
65 if (exec) logln("TestSuite RegexTest: ");
66 switch (index) {
67
68 case 0: name = "Basic";
69 if (exec) Basic();
70 break;
71 case 1: name = "API_Match";
72 if (exec) API_Match();
73 break;
74 case 2: name = "API_Replace";
75 if (exec) API_Replace();
76 break;
77 case 3: name = "API_Pattern";
78 if (exec) API_Pattern();
79 break;
80 case 4:
81#if !UCONFIG_NO_FILE_IO
82 name = "Extended";
83 if (exec) Extended();
84#else
85 name = "skip";
86#endif
87 break;
88 case 5: name = "Errors";
89 if (exec) Errors();
90 break;
91 case 6: name = "PerlTests";
92 if (exec) PerlTests();
93 break;
94 case 7: name = "Callbacks";
95 if (exec) Callbacks();
96 break;
97 case 8: name = "FindProgressCallbacks";
98 if (exec) FindProgressCallbacks();
99 break;
100 case 9: name = "Bug 6149";
101 if (exec) Bug6149();
102 break;
103 case 10: name = "UTextBasic";
104 if (exec) UTextBasic();
105 break;
106 case 11: name = "API_Match_UTF8";
107 if (exec) API_Match_UTF8();
108 break;
109 case 12: name = "API_Replace_UTF8";
110 if (exec) API_Replace_UTF8();
111 break;
112 case 13: name = "API_Pattern_UTF8";
113 if (exec) API_Pattern_UTF8();
114 break;
115 case 14: name = "PerlTestsUTF8";
116 if (exec) PerlTestsUTF8();
117 break;
118 case 15: name = "PreAllocatedUTextCAPI";
119 if (exec) PreAllocatedUTextCAPI();
120 break;
121 case 16: name = "Bug 7651";
122 if (exec) Bug7651();
123 break;
124 case 17: name = "Bug 7740";
125 if (exec) Bug7740();
126 break;
127 case 18: name = "Bug 8479";
128 if (exec) Bug8479();
129 break;
130 case 19: name = "Bug 7029";
131 if (exec) Bug7029();
132 break;
133 case 20: name = "CheckInvBufSize";
134 if (exec) CheckInvBufSize();
135 break;
136 case 21: name = "Bug 9283";
137 if (exec) Bug9283();
138 break;
139 case 22: name = "Bug10459";
140 if (exec) Bug10459();
141 break;
142 case 23: name = "TestCaseInsensitiveStarters";
143 if (exec) TestCaseInsensitiveStarters();
144 break;
145 case 24: name = "TestBug11049";
146 if (exec) TestBug11049();
147 break;
148 case 25: name = "TestBug11371";
149 if (exec) TestBug11371();
150 break;
151 case 26: name = "TestBug11480";
152 if (exec) TestBug11480();
153 break;
154 case 27: name = "NamedCapture";
155 if (exec) NamedCapture();
156 break;
157 case 28: name = "NamedCaptureLimits";
158 if (exec) NamedCaptureLimits();
159 break;
160 default: name = "";
161 break; //needed to end loop
162 }
163}
164
165
166
167/**
168 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
169 * into ASCII.
170 * @see utext_openUTF8
171 */
172static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
173
174//---------------------------------------------------------------------------
175//
176// Error Checking / Reporting macros used in all of the tests.
177//
178//---------------------------------------------------------------------------
179
180static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
181 int64_t oldIndex = utext_getNativeIndex(text);
182 utext_setNativeIndex(text, 0);
183 char *bufPtr = buf;
184 UChar32 c = utext_next32From(text, 0);
185 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
186 if (0x000020<=c && c<0x00007e) {
187 *bufPtr = c;
188 } else {
189#if 0
190 sprintf(bufPtr,"U+%04X", c);
191 bufPtr+= strlen(bufPtr)-1;
192#else
193 *bufPtr = '%';
194#endif
195 }
196 bufPtr++;
197 c = UTEXT_NEXT32(text);
198 }
199 *bufPtr = 0;
200#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
201 char *ebuf = (char*)malloc(bufLen);
202 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
203 uprv_strncpy(buf, ebuf, bufLen);
204 free((void*)ebuf);
205#endif
206 utext_setNativeIndex(text, oldIndex);
207}
208
209
210static char ASSERT_BUF[1024];
211
212const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
213 if(message.length()==0) {
214 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
215 } else {
216 UnicodeString buf;
217 IntlTest::prettify(message,buf);
218 if(buf.length()==0) {
219 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
220 } else {
221 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
222 if(ASSERT_BUF[0]==0) {
223 ASSERT_BUF[0]=0;
224 for(int32_t i=0;i<buf.length();i++) {
225 UChar ch = buf[i];
226 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
227 }
228 }
229 }
230 }
231 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
232 return ASSERT_BUF;
233}
234
// Logs a printable rendering of a UText (at most 199 chars), tagged with the
// source location and the spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a failure and returns from the enclosing function if the
// in-scope variable 'status' holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports a failure if expr evaluates to FALSE. Does not return.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates expr with a fresh local 'status' (shadowing any outer one) and
// reports a failure unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and
// does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but also reports the caller-supplied line and returns
// from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
}
258
259
260static UBool testUTextEqual(UText *uta, UText *utb) {
261 UChar32 ca = 0;
262 UChar32 cb = 0;
263 utext_setNativeIndex(uta, 0);
264 utext_setNativeIndex(utb, 0);
265 do {
266 ca = utext_next32(uta);
267 cb = utext_next32(utb);
268 if (ca != cb) {
269 break;
270 }
271 } while (ca != U_SENTINEL);
272 return ca == cb;
273}
274
275
276/**
277 * @param expected expected text in UTF-8 (not platform) codepage
278 */
279void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
280 UErrorCode status = U_ZERO_ERROR;
281 UText expectedText = UTEXT_INITIALIZER;
282 utext_openUTF8(&expectedText, expected, -1, &status);
283 if(U_FAILURE(status)) {
284 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
285 return;
286 }
287 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
288 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
289 return;
290 }
291 utext_setNativeIndex(actual, 0);
292 if (!testUTextEqual(&expectedText, actual)) {
293 char buf[201 /*21*/];
294 char expectedBuf[201];
295 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
296 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
297 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
298 }
299 utext_close(&expectedText);
300}
301/**
302 * @param expected invariant (platform local text) input
303 */
304
305void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
306 UErrorCode status = U_ZERO_ERROR;
307 UText expectedText = UTEXT_INITIALIZER;
308 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
309 if(U_FAILURE(status)) {
310 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
311 return;
312 }
313 utext_setNativeIndex(actual, 0);
314 if (!testUTextEqual(&expectedText, actual)) {
315 char buf[201 /*21*/];
316 char expectedBuf[201];
317 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
318 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
319 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
320 }
321 utext_close(&expectedText);
322}
323
/**
 * Checks a UText against expected text supplied as UTF-8.
 * Forwards to assertUText() with the location of the invocation.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Checks a UText against expected text supplied in invariant characters.
 * Forwards to assertUTextInvariant() with the location of the invocation.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
332
333/**
334 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
335 * passed into utext_openUTF8. An error will be given if
336 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
337 */
338
339#define INV_BUFSIZ 2048 /* increase this if too small */
340
341static int64_t inv_next=0;
342
343#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
344static char inv_buf[INV_BUFSIZ];
345#endif
346
347static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
348 if(length==-1) length=strlen(inv);
349#if U_CHARSET_FAMILY==U_ASCII_FAMILY
350 inv_next+=length;
351 return utext_openUTF8(ut, inv, length, status);
352#else
353 if(inv_next+length+1>INV_BUFSIZ) {
354 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
355 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
356 *status = U_MEMORY_ALLOCATION_ERROR;
357 return NULL;
358 }
359
360 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
361 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
362 inv_next+=length;
363
364#if 0
365 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
366#endif
367
368 return utext_openUTF8(ut, (const char*)buf, length, status);
369#endif
370}
371
372
373//---------------------------------------------------------------------------
374//
375// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
376// for the LookingAt() and Match() functions.
377//
378// usage:
379// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
380//
381// The expected results are UBool - TRUE or FALSE.
382// The input text is unescaped. The pattern is not.
383//
384//
385//---------------------------------------------------------------------------
386
// Runs one lookingAt()/matches() check twice: first through the
// UnicodeString API, then through the UTF-8 UText API.
#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
388
389UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
390 const UnicodeString pattern(pat, -1, US_INV);
391 const UnicodeString inputText(text, -1, US_INV);
392 UErrorCode status = U_ZERO_ERROR;
393 UParseError pe;
394 RegexPattern *REPattern = NULL;
395 RegexMatcher *REMatcher = NULL;
396 UBool retVal = TRUE;
397
398 UnicodeString patString(pat, -1, US_INV);
399 REPattern = RegexPattern::compile(patString, 0, pe, status);
400 if (U_FAILURE(status)) {
401 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
402 line, u_errorName(status));
403 return FALSE;
404 }
405 if (line==376) { REPattern->dumpPattern();}
406
407 UnicodeString inputString(inputText);
408 UnicodeString unEscapedInput = inputString.unescape();
409 REMatcher = REPattern->matcher(unEscapedInput, status);
410 if (U_FAILURE(status)) {
411 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
412 line, u_errorName(status));
413 return FALSE;
414 }
415
416 UBool actualmatch;
417 actualmatch = REMatcher->lookingAt(status);
418 if (U_FAILURE(status)) {
419 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
420 line, u_errorName(status));
421 retVal = FALSE;
422 }
423 if (actualmatch != looking) {
424 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
425 retVal = FALSE;
426 }
427
428 status = U_ZERO_ERROR;
429 actualmatch = REMatcher->matches(status);
430 if (U_FAILURE(status)) {
431 errln("RegexTest failure in matches() at line %d. Status = %s\n",
432 line, u_errorName(status));
433 retVal = FALSE;
434 }
435 if (actualmatch != match) {
436 errln("RegexTest: wrong return from matches() at line %d.\n", line);
437 retVal = FALSE;
438 }
439
440 if (retVal == FALSE) {
441 REPattern->dumpPattern();
442 }
443
444 delete REPattern;
445 delete REMatcher;
446 return retVal;
447}
448
449
450UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
451 UText pattern = UTEXT_INITIALIZER;
452 int32_t inputUTF8Length;
453 char *textChars = NULL;
454 UText inputText = UTEXT_INITIALIZER;
455 UErrorCode status = U_ZERO_ERROR;
456 UParseError pe;
457 RegexPattern *REPattern = NULL;
458 RegexMatcher *REMatcher = NULL;
459 UBool retVal = TRUE;
460
461 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
462 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
463 if (U_FAILURE(status)) {
464 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
465 line, u_errorName(status));
466 return FALSE;
467 }
468
469 UnicodeString inputString(text, -1, US_INV);
470 UnicodeString unEscapedInput = inputString.unescape();
471 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
472 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
473
474 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
475 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
476 // UTF-8 does not allow unpaired surrogates, so this could actually happen
477 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
478 return TRUE; // not a failure of the Regex engine
479 }
480 status = U_ZERO_ERROR; // buffer overflow
481 textChars = new char[inputUTF8Length+1];
482 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
483 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
484
485 REMatcher = &REPattern->matcher(status)->reset(&inputText);
486 if (U_FAILURE(status)) {
487 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
488 line, u_errorName(status));
489 return FALSE;
490 }
491
492 UBool actualmatch;
493 actualmatch = REMatcher->lookingAt(status);
494 if (U_FAILURE(status)) {
495 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
496 line, u_errorName(status));
497 retVal = FALSE;
498 }
499 if (actualmatch != looking) {
500 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
501 retVal = FALSE;
502 }
503
504 status = U_ZERO_ERROR;
505 actualmatch = REMatcher->matches(status);
506 if (U_FAILURE(status)) {
507 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
508 line, u_errorName(status));
509 retVal = FALSE;
510 }
511 if (actualmatch != match) {
512 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
513 retVal = FALSE;
514 }
515
516 if (retVal == FALSE) {
517 REPattern->dumpPattern();
518 }
519
520 delete REPattern;
521 delete REMatcher;
522 utext_close(&inputText);
523 utext_close(&pattern);
524 delete[] textChars;
525 return retVal;
526}
527
528
529
530//---------------------------------------------------------------------------
531//
532// REGEX_ERR Macro + invocation function to simplify writing tests
533// regex tests for incorrect patterns
534//
535// usage:
536// REGEX_ERR("pattern", expected error line, column, expected status);
537//
538//---------------------------------------------------------------------------
// Forwards to regex_err(), adding the line of the invocation.
// Note that the expansion already ends with a semicolon.
#define REGEX_ERR(pat, line, col, status) \
    regex_err(pat, line, col, status, __LINE__);
540
541void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
542 UErrorCode expectedStatus, int32_t line) {
543 UnicodeString pattern(pat);
544
545 UErrorCode status = U_ZERO_ERROR;
546 UParseError pe;
547 RegexPattern *callerPattern = NULL;
548
549 //
550 // Compile the caller's pattern
551 //
552 UnicodeString patString(pat);
553 callerPattern = RegexPattern::compile(patString, 0, pe, status);
554 if (status != expectedStatus) {
555 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
556 } else {
557 if (status != U_ZERO_ERROR) {
558 if (pe.line != errLine || pe.offset != errCol) {
559 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
560 line, errLine, errCol, pe.line, pe.offset);
561 }
562 }
563 }
564
565 delete callerPattern;
566
567 //
568 // Compile again, using a UTF-8-based UText
569 //
570 UText patternText = UTEXT_INITIALIZER;
571 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
572 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
573 if (status != expectedStatus) {
574 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
575 } else {
576 if (status != U_ZERO_ERROR) {
577 if (pe.line != errLine || pe.offset != errCol) {
578 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
579 line, errLine, errCol, pe.line, pe.offset);
580 }
581 }
582 }
583
584 delete callerPattern;
585 utext_close(&patternText);
586}
587
588
589
590//---------------------------------------------------------------------------
591//
592// Basic Check for basic functionality of regex pattern matching.
593// Avoid the use of REGEX_FIND test macro, which has
594// substantial dependencies on basic Regex functionality.
595//
596//---------------------------------------------------------------------------
597void RegexTest::Basic() {
598
599
600//
601// Debug - slide failing test cases early
602//
603#if 0
604 {
605 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
606 UParseError pe;
607 UErrorCode status = U_ZERO_ERROR;
608 RegexPattern *pattern;
609 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
610 pattern->dumpPattern();
611 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
612 UBool result = m->find();
613 printf("result = %d\n", result);
614 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
615 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
616 }
617 exit(1);
618#endif
619
620
621 //
622 // Pattern with parentheses
623 //
624 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
625 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
626 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
627
628 //
629 // Patterns with *
630 //
631 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
632 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
633 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
634 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
635 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
636
637 REGEX_TESTLM("a*", "", TRUE, TRUE);
638 REGEX_TESTLM("a*", "b", TRUE, FALSE);
639
640
641 //
642 // Patterns with "."
643 //
644 REGEX_TESTLM(".", "abc", TRUE, FALSE);
645 REGEX_TESTLM("...", "abc", TRUE, TRUE);
646 REGEX_TESTLM("....", "abc", FALSE, FALSE);
647 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
648 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
649 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
650 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
651 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
652
653 //
654 // Patterns with * applied to chars at end of literal string
655 //
656 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
657 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
658
659 //
660 // Supplemental chars match as single chars, not a pair of surrogates.
661 //
662 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
663 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
664 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
665
666
667 //
668 // UnicodeSets in the pattern
669 //
670 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
671 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
672 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
673 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
674 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
675 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
676
677 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
678 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
679 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
680 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
681 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
682
683 //
684 // OR operator in patterns
685 //
686 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
687 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
688 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
689 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
690
691 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
692 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
693 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
694 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
695 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
696 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
697
698 //
699 // +
700 //
701 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
702 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
703 REGEX_TESTLM("b+", "", FALSE, FALSE);
704 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
705 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
706 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
707
708 //
709 // ?
710 //
711 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
712 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
713 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
714 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
715 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
716 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
717 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
718 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
719 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
720
721 //
722 // Escape sequences that become single literal chars, handled internally
723 // by ICU's Unescape.
724 //
725
726 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
727 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
728 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
729 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
730 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
731 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
732 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
733 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
734 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
735 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
736
737 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
738 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
739
740 // Escape of special chars in patterns
741 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
742}
743
744
745//---------------------------------------------------------------------------
746//
747// UTextBasic Check for quirks that are specific to the UText
748// implementation.
749//
750//---------------------------------------------------------------------------
751void RegexTest::UTextBasic() {
752 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
753 UErrorCode status = U_ZERO_ERROR;
754 UText pattern = UTEXT_INITIALIZER;
755 utext_openUTF8(&pattern, str_abc, -1, &status);
756 RegexMatcher matcher(&pattern, 0, status);
757 REGEX_CHECK_STATUS;
758
759 UText input = UTEXT_INITIALIZER;
760 utext_openUTF8(&input, str_abc, -1, &status);
761 REGEX_CHECK_STATUS;
762 matcher.reset(&input);
763 REGEX_CHECK_STATUS;
764 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
765
766 matcher.reset(matcher.inputText());
767 REGEX_CHECK_STATUS;
768 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
769
770 utext_close(&pattern);
771 utext_close(&input);
772}
773
774
775//---------------------------------------------------------------------------
776//
777// API_Match Test that the API for class RegexMatcher
778// is present and nominally working, but excluding functions
779// implementing replace operations.
780//
781//---------------------------------------------------------------------------
782void RegexTest::API_Match() {
783 UParseError pe;
784 UErrorCode status=U_ZERO_ERROR;
785 int32_t flags = 0;
786
787 //
788 // Debug - slide failing test cases early
789 //
790#if 0
791 {
792 }
793 return;
794#endif
795
796 //
797 // Simple pattern compilation
798 //
799 {
800 UnicodeString re("abc");
801 RegexPattern *pat2;
802 pat2 = RegexPattern::compile(re, flags, pe, status);
803 REGEX_CHECK_STATUS;
804
805 UnicodeString inStr1 = "abcdef this is a test";
806 UnicodeString instr2 = "not abc";
807 UnicodeString empty = "";
808
809
810 //
811 // Matcher creation and reset.
812 //
813 RegexMatcher *m1 = pat2->matcher(inStr1, status);
814 REGEX_CHECK_STATUS;
815 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
816 REGEX_ASSERT(m1->input() == inStr1);
817 m1->reset(instr2);
818 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
819 REGEX_ASSERT(m1->input() == instr2);
820 m1->reset(inStr1);
821 REGEX_ASSERT(m1->input() == inStr1);
822 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
823 m1->reset(empty);
824 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
825 REGEX_ASSERT(m1->input() == empty);
826 REGEX_ASSERT(&m1->pattern() == pat2);
827
828 //
829 // reset(pos, status)
830 //
831 m1->reset(inStr1);
832 m1->reset(4, status);
833 REGEX_CHECK_STATUS;
834 REGEX_ASSERT(m1->input() == inStr1);
835 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
836
837 m1->reset(-1, status);
838 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
839 status = U_ZERO_ERROR;
840
841 m1->reset(0, status);
842 REGEX_CHECK_STATUS;
843 status = U_ZERO_ERROR;
844
845 int32_t len = m1->input().length();
846 m1->reset(len-1, status);
847 REGEX_CHECK_STATUS;
848 status = U_ZERO_ERROR;
849
850 m1->reset(len, status);
851 REGEX_CHECK_STATUS;
852 status = U_ZERO_ERROR;
853
854 m1->reset(len+1, status);
855 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
856 status = U_ZERO_ERROR;
857
858 //
859 // match(pos, status)
860 //
861 m1->reset(instr2);
862 REGEX_ASSERT(m1->matches(4, status) == TRUE);
863 m1->reset();
864 REGEX_ASSERT(m1->matches(3, status) == FALSE);
865 m1->reset();
866 REGEX_ASSERT(m1->matches(5, status) == FALSE);
867 REGEX_ASSERT(m1->matches(4, status) == TRUE);
868 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
869 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
870
871 // Match() at end of string should fail, but should not
872 // be an error.
873 status = U_ZERO_ERROR;
874 len = m1->input().length();
875 REGEX_ASSERT(m1->matches(len, status) == FALSE);
876 REGEX_CHECK_STATUS;
877
878 // Match beyond end of string should fail with an error.
879 status = U_ZERO_ERROR;
880 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
881 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
882
883 // Successful match at end of string.
884 {
885 status = U_ZERO_ERROR;
886 RegexMatcher m("A?", 0, status); // will match zero length string.
887 REGEX_CHECK_STATUS;
888 m.reset(inStr1);
889 len = inStr1.length();
890 REGEX_ASSERT(m.matches(len, status) == TRUE);
891 REGEX_CHECK_STATUS;
892 m.reset(empty);
893 REGEX_ASSERT(m.matches(0, status) == TRUE);
894 REGEX_CHECK_STATUS;
895 }
896
897
898 //
899 // lookingAt(pos, status)
900 //
901 status = U_ZERO_ERROR;
902 m1->reset(instr2); // "not abc"
903 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
904 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
905 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
906 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
907 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
908 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
909 status = U_ZERO_ERROR;
910 len = m1->input().length();
911 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
912 REGEX_CHECK_STATUS;
913 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
914 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
915
916 delete m1;
917 delete pat2;
918 }
919
920
921 //
922 // Capture Group.
923 // RegexMatcher::start();
924 // RegexMatcher::end();
925 // RegexMatcher::groupCount();
926 //
927 {
928 int32_t flags=0;
929 UParseError pe;
930 UErrorCode status=U_ZERO_ERROR;
931
932 UnicodeString re("01(23(45)67)(.*)");
933 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
934 REGEX_CHECK_STATUS;
935 UnicodeString data = "0123456789";
936
937 RegexMatcher *matcher = pat->matcher(data, status);
938 REGEX_CHECK_STATUS;
939 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
940 static const int32_t matchStarts[] = {0, 2, 4, 8};
941 static const int32_t matchEnds[] = {10, 8, 6, 10};
942 int32_t i;
943 for (i=0; i<4; i++) {
944 int32_t actualStart = matcher->start(i, status);
945 REGEX_CHECK_STATUS;
946 if (actualStart != matchStarts[i]) {
947 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
948 __LINE__, i, matchStarts[i], actualStart);
949 }
950 int32_t actualEnd = matcher->end(i, status);
951 REGEX_CHECK_STATUS;
952 if (actualEnd != matchEnds[i]) {
953 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
954 __LINE__, i, matchEnds[i], actualEnd);
955 }
956 }
957
958 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
959 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
960
961 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
962 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
963 matcher->reset();
964 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
965
966 matcher->lookingAt(status);
967 REGEX_ASSERT(matcher->group(status) == "0123456789");
968 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
969 REGEX_ASSERT(matcher->group(1, status) == "234567" );
970 REGEX_ASSERT(matcher->group(2, status) == "45" );
971 REGEX_ASSERT(matcher->group(3, status) == "89" );
972 REGEX_CHECK_STATUS;
973 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
974 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
975 matcher->reset();
976 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
977
978 delete matcher;
979 delete pat;
980
981 }
982
983 //
984 // find
985 //
986 {
987 int32_t flags=0;
988 UParseError pe;
989 UErrorCode status=U_ZERO_ERROR;
990
991 UnicodeString re("abc");
992 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
993 REGEX_CHECK_STATUS;
994 UnicodeString data = ".abc..abc...abc..";
995 // 012345678901234567
996
997 RegexMatcher *matcher = pat->matcher(data, status);
998 REGEX_CHECK_STATUS;
999 REGEX_ASSERT(matcher->find());
1000 REGEX_ASSERT(matcher->start(status) == 1);
1001 REGEX_ASSERT(matcher->find());
1002 REGEX_ASSERT(matcher->start(status) == 6);
1003 REGEX_ASSERT(matcher->find());
1004 REGEX_ASSERT(matcher->start(status) == 12);
1005 REGEX_ASSERT(matcher->find() == FALSE);
1006 REGEX_ASSERT(matcher->find() == FALSE);
1007
1008 matcher->reset();
1009 REGEX_ASSERT(matcher->find());
1010 REGEX_ASSERT(matcher->start(status) == 1);
1011
1012 REGEX_ASSERT(matcher->find(0, status));
1013 REGEX_ASSERT(matcher->start(status) == 1);
1014 REGEX_ASSERT(matcher->find(1, status));
1015 REGEX_ASSERT(matcher->start(status) == 1);
1016 REGEX_ASSERT(matcher->find(2, status));
1017 REGEX_ASSERT(matcher->start(status) == 6);
1018 REGEX_ASSERT(matcher->find(12, status));
1019 REGEX_ASSERT(matcher->start(status) == 12);
1020 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1021 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1022 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1023 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1024
1025 status = U_ZERO_ERROR;
1026 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1027 status = U_ZERO_ERROR;
1028 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1029
1030 REGEX_ASSERT(matcher->groupCount() == 0);
1031
1032 delete matcher;
1033 delete pat;
1034 }
1035
1036
1037 //
1038 // find, with \G in pattern (true if at the end of a previous match).
1039 //
1040 {
1041 int32_t flags=0;
1042 UParseError pe;
1043 UErrorCode status=U_ZERO_ERROR;
1044
1045 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1046 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1047 REGEX_CHECK_STATUS;
1048 UnicodeString data = ".abcabc.abc..";
1049 // 012345678901234567
1050
1051 RegexMatcher *matcher = pat->matcher(data, status);
1052 REGEX_CHECK_STATUS;
1053 REGEX_ASSERT(matcher->find());
1054 REGEX_ASSERT(matcher->start(status) == 0);
1055 REGEX_ASSERT(matcher->start(1, status) == -1);
1056 REGEX_ASSERT(matcher->start(2, status) == 1);
1057
1058 REGEX_ASSERT(matcher->find());
1059 REGEX_ASSERT(matcher->start(status) == 4);
1060 REGEX_ASSERT(matcher->start(1, status) == 4);
1061 REGEX_ASSERT(matcher->start(2, status) == -1);
1062 REGEX_CHECK_STATUS;
1063
1064 delete matcher;
1065 delete pat;
1066 }
1067
1068 //
1069 // find with zero length matches, match position should bump ahead
1070 // to prevent loops.
1071 //
1072 {
1073 int32_t i;
1074 UErrorCode status=U_ZERO_ERROR;
1075 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1076 // using an always-true look-ahead.
1077 REGEX_CHECK_STATUS;
1078 UnicodeString s(" ");
1079 m.reset(s);
1080 for (i=0; ; i++) {
1081 if (m.find() == FALSE) {
1082 break;
1083 }
1084 REGEX_ASSERT(m.start(status) == i);
1085 REGEX_ASSERT(m.end(status) == i);
1086 }
1087 REGEX_ASSERT(i==5);
1088
1089 // Check that the bump goes over surrogate pairs OK
1090 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1091 s = s.unescape();
1092 m.reset(s);
1093 for (i=0; ; i+=2) {
1094 if (m.find() == FALSE) {
1095 break;
1096 }
1097 REGEX_ASSERT(m.start(status) == i);
1098 REGEX_ASSERT(m.end(status) == i);
1099 }
1100 REGEX_ASSERT(i==10);
1101 }
1102 {
1103 // find() loop breaking test.
1104 // with pattern of /.?/, should see a series of one char matches, then a single
1105 // match of zero length at the end of the input string.
1106 int32_t i;
1107 UErrorCode status=U_ZERO_ERROR;
1108 RegexMatcher m(".?", 0, status);
1109 REGEX_CHECK_STATUS;
1110 UnicodeString s(" ");
1111 m.reset(s);
1112 for (i=0; ; i++) {
1113 if (m.find() == FALSE) {
1114 break;
1115 }
1116 REGEX_ASSERT(m.start(status) == i);
1117 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1118 }
1119 REGEX_ASSERT(i==5);
1120 }
1121
1122
1123 //
1124 // Matchers with no input string behave as if they had an empty input string.
1125 //
1126
1127 {
1128 UErrorCode status = U_ZERO_ERROR;
1129 RegexMatcher m(".?", 0, status);
1130 REGEX_CHECK_STATUS;
1131 REGEX_ASSERT(m.find());
1132 REGEX_ASSERT(m.start(status) == 0);
1133 REGEX_ASSERT(m.input() == "");
1134 }
1135 {
1136 UErrorCode status = U_ZERO_ERROR;
1137 RegexPattern *p = RegexPattern::compile(".", 0, status);
1138 RegexMatcher *m = p->matcher(status);
1139 REGEX_CHECK_STATUS;
1140
1141 REGEX_ASSERT(m->find() == FALSE);
1142 REGEX_ASSERT(m->input() == "");
1143 delete m;
1144 delete p;
1145 }
1146
1147 //
1148 // Regions
1149 //
1150 {
1151 UErrorCode status = U_ZERO_ERROR;
1152 UnicodeString testString("This is test data");
1153 RegexMatcher m(".*", testString, 0, status);
1154 REGEX_CHECK_STATUS;
1155 REGEX_ASSERT(m.regionStart() == 0);
1156 REGEX_ASSERT(m.regionEnd() == testString.length());
1157 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1158 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1159
1160 m.region(2,4, status);
1161 REGEX_CHECK_STATUS;
1162 REGEX_ASSERT(m.matches(status));
1163 REGEX_ASSERT(m.start(status)==2);
1164 REGEX_ASSERT(m.end(status)==4);
1165 REGEX_CHECK_STATUS;
1166
1167 m.reset();
1168 REGEX_ASSERT(m.regionStart() == 0);
1169 REGEX_ASSERT(m.regionEnd() == testString.length());
1170
1171 UnicodeString shorterString("short");
1172 m.reset(shorterString);
1173 REGEX_ASSERT(m.regionStart() == 0);
1174 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1175
1176 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1177 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1178 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1179 REGEX_ASSERT(&m == &m.reset());
1180 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1181
1182 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1183 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1184 REGEX_ASSERT(&m == &m.reset());
1185 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1186
1187 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1188 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1189 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1190 REGEX_ASSERT(&m == &m.reset());
1191 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1192
1193 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1194 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1195 REGEX_ASSERT(&m == &m.reset());
1196 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1197
1198 }
1199
1200 //
1201 // hitEnd() and requireEnd()
1202 //
1203 {
1204 UErrorCode status = U_ZERO_ERROR;
1205 UnicodeString testString("aabb");
1206 RegexMatcher m1(".*", testString, 0, status);
1207 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1208 REGEX_ASSERT(m1.hitEnd() == TRUE);
1209 REGEX_ASSERT(m1.requireEnd() == FALSE);
1210 REGEX_CHECK_STATUS;
1211
1212 status = U_ZERO_ERROR;
1213 RegexMatcher m2("a*", testString, 0, status);
1214 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1215 REGEX_ASSERT(m2.hitEnd() == FALSE);
1216 REGEX_ASSERT(m2.requireEnd() == FALSE);
1217 REGEX_CHECK_STATUS;
1218
1219 status = U_ZERO_ERROR;
1220 RegexMatcher m3(".*$", testString, 0, status);
1221 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1222 REGEX_ASSERT(m3.hitEnd() == TRUE);
1223 REGEX_ASSERT(m3.requireEnd() == TRUE);
1224 REGEX_CHECK_STATUS;
1225 }
1226
1227
1228 //
1229 // Compilation error on reset with UChar *
1230 // These were a hazard that people were stumbling over with runtime errors.
1231 // Changed them to compiler errors by adding private methods that more closely
1232 // matched the incorrect use of the functions.
1233 //
1234#if 0
1235 {
1236 UErrorCode status = U_ZERO_ERROR;
1237 UChar ucharString[20];
1238 RegexMatcher m(".", 0, status);
1239 m.reset(ucharString); // should not compile.
1240
1241 RegexPattern *p = RegexPattern::compile(".", 0, status);
1242 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1243
1244 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1245 }
1246#endif
1247
1248 //
1249 // Time Outs.
1250 // Note: These tests will need to be changed when the regexp engine is
1251 // able to detect and cut short the exponential time behavior on
1252 // this type of match.
1253 //
1254 {
1255 UErrorCode status = U_ZERO_ERROR;
1256 // Enough 'a's in the string to cause the match to time out.
1257 // (Each additional 'a' doubles the time)
1258 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1259 RegexMatcher matcher("(a+)+b", testString, 0, status);
1260 REGEX_CHECK_STATUS;
1261 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1262 matcher.setTimeLimit(100, status);
1263 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1264 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1265 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1266 }
1267 {
1268 UErrorCode status = U_ZERO_ERROR;
1269 // Few enough 'a's to slip in under the time limit.
1270 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1271 RegexMatcher matcher("(a+)+b", testString, 0, status);
1272 REGEX_CHECK_STATUS;
1273 matcher.setTimeLimit(100, status);
1274 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1275 REGEX_CHECK_STATUS;
1276 }
1277
1278 //
1279 // Stack Limits
1280 //
1281 {
1282 UErrorCode status = U_ZERO_ERROR;
1283 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1284
1285 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1286 // of the '+', and makes the stack frames larger.
1287 RegexMatcher matcher("(A)+A$", testString, 0, status);
1288
1289 // With the default stack, this match should fail to run
1290 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1291 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1292
1293 // With unlimited stack, it should run
1294 status = U_ZERO_ERROR;
1295 matcher.setStackLimit(0, status);
1296 REGEX_CHECK_STATUS;
1297 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1298 REGEX_CHECK_STATUS;
1299 REGEX_ASSERT(matcher.getStackLimit() == 0);
1300
1301 // With a limited stack, the match should fail
1302 status = U_ZERO_ERROR;
1303 matcher.setStackLimit(10000, status);
1304 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1305 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1306 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1307 }
1308
1309 // A pattern that doesn't save state should work with
1310 // a minimal sized stack
1311 {
1312 UErrorCode status = U_ZERO_ERROR;
1313 UnicodeString testString = "abc";
1314 RegexMatcher matcher("abc", testString, 0, status);
1315 REGEX_CHECK_STATUS;
1316 matcher.setStackLimit(30, status);
1317 REGEX_CHECK_STATUS;
1318 REGEX_ASSERT(matcher.matches(status) == TRUE);
1319 REGEX_CHECK_STATUS;
1320 REGEX_ASSERT(matcher.getStackLimit() == 30);
1321
1322 // Negative stack sizes should fail
1323 status = U_ZERO_ERROR;
1324 matcher.setStackLimit(1000, status);
1325 REGEX_CHECK_STATUS;
1326 matcher.setStackLimit(-1, status);
1327 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1328 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1329 }
1330
1331
1332}
1333
1334
1335
1336
1337
1338
1339//---------------------------------------------------------------------------
1340//
1341// API_Replace API test for class RegexMatcher, testing the
1342// Replace family of functions.
1343//
1344//---------------------------------------------------------------------------
1345void RegexTest::API_Replace() {
1346 //
1347 // Replace
1348 //
1349 int32_t flags=0;
1350 UParseError pe;
1351 UErrorCode status=U_ZERO_ERROR;
1352
1353 UnicodeString re("abc");
1354 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1355 REGEX_CHECK_STATUS;
1356 UnicodeString data = ".abc..abc...abc..";
1357 // 012345678901234567
1358 RegexMatcher *matcher = pat->matcher(data, status);
1359
1360 //
1361 // Plain vanilla matches.
1362 //
1363 UnicodeString dest;
1364 dest = matcher->replaceFirst("yz", status);
1365 REGEX_CHECK_STATUS;
1366 REGEX_ASSERT(dest == ".yz..abc...abc..");
1367
1368 dest = matcher->replaceAll("yz", status);
1369 REGEX_CHECK_STATUS;
1370 REGEX_ASSERT(dest == ".yz..yz...yz..");
1371
1372 //
1373 // Plain vanilla non-matches.
1374 //
1375 UnicodeString d2 = ".abx..abx...abx..";
1376 matcher->reset(d2);
1377 dest = matcher->replaceFirst("yz", status);
1378 REGEX_CHECK_STATUS;
1379 REGEX_ASSERT(dest == ".abx..abx...abx..");
1380
1381 dest = matcher->replaceAll("yz", status);
1382 REGEX_CHECK_STATUS;
1383 REGEX_ASSERT(dest == ".abx..abx...abx..");
1384
1385 //
1386 // Empty source string
1387 //
1388 UnicodeString d3 = "";
1389 matcher->reset(d3);
1390 dest = matcher->replaceFirst("yz", status);
1391 REGEX_CHECK_STATUS;
1392 REGEX_ASSERT(dest == "");
1393
1394 dest = matcher->replaceAll("yz", status);
1395 REGEX_CHECK_STATUS;
1396 REGEX_ASSERT(dest == "");
1397
1398 //
1399 // Empty substitution string
1400 //
1401 matcher->reset(data); // ".abc..abc...abc.."
1402 dest = matcher->replaceFirst("", status);
1403 REGEX_CHECK_STATUS;
1404 REGEX_ASSERT(dest == "...abc...abc..");
1405
1406 dest = matcher->replaceAll("", status);
1407 REGEX_CHECK_STATUS;
1408 REGEX_ASSERT(dest == "........");
1409
1410 //
1411 // match whole string
1412 //
1413 UnicodeString d4 = "abc";
1414 matcher->reset(d4);
1415 dest = matcher->replaceFirst("xyz", status);
1416 REGEX_CHECK_STATUS;
1417 REGEX_ASSERT(dest == "xyz");
1418
1419 dest = matcher->replaceAll("xyz", status);
1420 REGEX_CHECK_STATUS;
1421 REGEX_ASSERT(dest == "xyz");
1422
1423 //
1424 // Capture Group, simple case
1425 //
1426 UnicodeString re2("a(..)");
1427 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1428 REGEX_CHECK_STATUS;
1429 UnicodeString d5 = "abcdefg";
1430 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1431 REGEX_CHECK_STATUS;
1432 dest = matcher2->replaceFirst("$1$1", status);
1433 REGEX_CHECK_STATUS;
1434 REGEX_ASSERT(dest == "bcbcdefg");
1435
1436 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1437 REGEX_CHECK_STATUS;
1438 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1439
1440 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1441 REGEX_ASSERT(U_FAILURE(status));
1442 status = U_ZERO_ERROR;
1443
1444 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1445 replacement = replacement.unescape();
1446 dest = matcher2->replaceFirst(replacement, status);
1447 REGEX_CHECK_STATUS;
1448 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1449
1450 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1451
1452
1453 //
1454 // Replacement String with \u hex escapes
1455 //
1456 {
1457 UnicodeString src = "abc 1 abc 2 abc 3";
1458 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1459 matcher->reset(src);
1460 UnicodeString result = matcher->replaceAll(substitute, status);
1461 REGEX_CHECK_STATUS;
1462 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1463 }
1464 {
1465 UnicodeString src = "abc !";
1466 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1467 matcher->reset(src);
1468 UnicodeString result = matcher->replaceAll(substitute, status);
1469 REGEX_CHECK_STATUS;
1470 UnicodeString expected = UnicodeString("--");
1471 expected.append((UChar32)0x10000);
1472 expected.append("-- !");
1473 REGEX_ASSERT(result == expected);
1474 }
1475 // TODO: need more through testing of capture substitutions.
1476
1477 // Bug 4057
1478 //
1479 {
1480 status = U_ZERO_ERROR;
1481 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1482 RegexMatcher m("ss(.*?)ee", 0, status);
1483 REGEX_CHECK_STATUS;
1484 UnicodeString result;
1485
1486 // Multiple finds do NOT bump up the previous appendReplacement postion.
1487 m.reset(s);
1488 m.find();
1489 m.find();
1490 m.appendReplacement(result, "ooh", status);
1491 REGEX_CHECK_STATUS;
1492 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1493
1494 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1495 status = U_ZERO_ERROR;
1496 result.truncate(0);
1497 m.reset(10, status);
1498 m.find();
1499 m.find();
1500 m.appendReplacement(result, "ooh", status);
1501 REGEX_CHECK_STATUS;
1502 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1503
1504 // find() at interior of string, appendReplacemnt still starts at beginning.
1505 status = U_ZERO_ERROR;
1506 result.truncate(0);
1507 m.reset();
1508 m.find(10, status);
1509 m.find();
1510 m.appendReplacement(result, "ooh", status);
1511 REGEX_CHECK_STATUS;
1512 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1513
1514 m.appendTail(result);
1515 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1516
1517 }
1518
1519 delete matcher2;
1520 delete pat2;
1521 delete matcher;
1522 delete pat;
1523}
1524
1525
1526//---------------------------------------------------------------------------
1527//
1528// API_Pattern Test that the API for class RegexPattern is
1529// present and nominally working.
1530//
1531//---------------------------------------------------------------------------
1532void RegexTest::API_Pattern() {
1533 RegexPattern pata; // Test default constructor to not crash.
1534 RegexPattern patb;
1535
1536 REGEX_ASSERT(pata == patb);
1537 REGEX_ASSERT(pata == pata);
1538
1539 UnicodeString re1("abc[a-l][m-z]");
1540 UnicodeString re2("def");
1541 UErrorCode status = U_ZERO_ERROR;
1542 UParseError pe;
1543
1544 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1545 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1546 REGEX_CHECK_STATUS;
1547 REGEX_ASSERT(*pat1 == *pat1);
1548 REGEX_ASSERT(*pat1 != pata);
1549
1550 // Assign
1551 patb = *pat1;
1552 REGEX_ASSERT(patb == *pat1);
1553
1554 // Copy Construct
1555 RegexPattern patc(*pat1);
1556 REGEX_ASSERT(patc == *pat1);
1557 REGEX_ASSERT(patb == patc);
1558 REGEX_ASSERT(pat1 != pat2);
1559 patb = *pat2;
1560 REGEX_ASSERT(patb != patc);
1561 REGEX_ASSERT(patb == *pat2);
1562
1563 // Compile with no flags.
1564 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1565 REGEX_ASSERT(*pat1a == *pat1);
1566
1567 REGEX_ASSERT(pat1a->flags() == 0);
1568
1569 // Compile with different flags should be not equal
1570 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1571 REGEX_CHECK_STATUS;
1572
1573 REGEX_ASSERT(*pat1b != *pat1a);
1574 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1575 REGEX_ASSERT(pat1a->flags() == 0);
1576 delete pat1b;
1577
1578 // clone
1579 RegexPattern *pat1c = pat1->clone();
1580 REGEX_ASSERT(*pat1c == *pat1);
1581 REGEX_ASSERT(*pat1c != *pat2);
1582
1583 delete pat1c;
1584 delete pat1a;
1585 delete pat1;
1586 delete pat2;
1587
1588
1589 //
1590 // Verify that a matcher created from a cloned pattern works.
1591 // (Jitterbug 3423)
1592 //
1593 {
1594 UErrorCode status = U_ZERO_ERROR;
1595 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1596 RegexPattern *pClone = pSource->clone();
1597 delete pSource;
1598 RegexMatcher *mFromClone = pClone->matcher(status);
1599 REGEX_CHECK_STATUS;
1600 UnicodeString s = "Hello World";
1601 mFromClone->reset(s);
1602 REGEX_ASSERT(mFromClone->find() == TRUE);
1603 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1604 REGEX_ASSERT(mFromClone->find() == TRUE);
1605 REGEX_ASSERT(mFromClone->group(status) == "World");
1606 REGEX_ASSERT(mFromClone->find() == FALSE);
1607 delete mFromClone;
1608 delete pClone;
1609 }
1610
1611 //
1612 // matches convenience API
1613 //
1614 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1615 REGEX_CHECK_STATUS;
1616 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1617 REGEX_CHECK_STATUS;
1618 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1619 REGEX_CHECK_STATUS;
1620 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1621 REGEX_CHECK_STATUS;
1622 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1623 REGEX_CHECK_STATUS;
1624 status = U_INDEX_OUTOFBOUNDS_ERROR;
1625 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1626 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1627
1628
1629 //
1630 // Split()
1631 //
1632 status = U_ZERO_ERROR;
1633 pat1 = RegexPattern::compile(" +", pe, status);
1634 REGEX_CHECK_STATUS;
1635 UnicodeString fields[10];
1636
1637 int32_t n;
1638 n = pat1->split("Now is the time", fields, 10, status);
1639 REGEX_CHECK_STATUS;
1640 REGEX_ASSERT(n==4);
1641 REGEX_ASSERT(fields[0]=="Now");
1642 REGEX_ASSERT(fields[1]=="is");
1643 REGEX_ASSERT(fields[2]=="the");
1644 REGEX_ASSERT(fields[3]=="time");
1645 REGEX_ASSERT(fields[4]=="");
1646
1647 n = pat1->split("Now is the time", fields, 2, status);
1648 REGEX_CHECK_STATUS;
1649 REGEX_ASSERT(n==2);
1650 REGEX_ASSERT(fields[0]=="Now");
1651 REGEX_ASSERT(fields[1]=="is the time");
1652 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1653
1654 fields[1] = "*";
1655 status = U_ZERO_ERROR;
1656 n = pat1->split("Now is the time", fields, 1, status);
1657 REGEX_CHECK_STATUS;
1658 REGEX_ASSERT(n==1);
1659 REGEX_ASSERT(fields[0]=="Now is the time");
1660 REGEX_ASSERT(fields[1]=="*");
1661 status = U_ZERO_ERROR;
1662
1663 n = pat1->split(" Now is the time ", fields, 10, status);
1664 REGEX_CHECK_STATUS;
1665 REGEX_ASSERT(n==6);
1666 REGEX_ASSERT(fields[0]=="");
1667 REGEX_ASSERT(fields[1]=="Now");
1668 REGEX_ASSERT(fields[2]=="is");
1669 REGEX_ASSERT(fields[3]=="the");
1670 REGEX_ASSERT(fields[4]=="time");
1671 REGEX_ASSERT(fields[5]=="");
1672
1673 n = pat1->split(" ", fields, 10, status);
1674 REGEX_CHECK_STATUS;
1675 REGEX_ASSERT(n==2);
1676 REGEX_ASSERT(fields[0]=="");
1677 REGEX_ASSERT(fields[1]=="");
1678
1679 fields[0] = "foo";
1680 n = pat1->split("", fields, 10, status);
1681 REGEX_CHECK_STATUS;
1682 REGEX_ASSERT(n==0);
1683 REGEX_ASSERT(fields[0]=="foo");
1684
1685 delete pat1;
1686
1687 // split, with a pattern with (capture)
1688 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1689 REGEX_CHECK_STATUS;
1690
1691 status = U_ZERO_ERROR;
1692 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1693 REGEX_CHECK_STATUS;
1694 REGEX_ASSERT(n==7);
1695 REGEX_ASSERT(fields[0]=="");
1696 REGEX_ASSERT(fields[1]=="a");
1697 REGEX_ASSERT(fields[2]=="Now is ");
1698 REGEX_ASSERT(fields[3]=="b");
1699 REGEX_ASSERT(fields[4]=="the time");
1700 REGEX_ASSERT(fields[5]=="c");
1701 REGEX_ASSERT(fields[6]=="");
1702 REGEX_ASSERT(status==U_ZERO_ERROR);
1703
1704 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1705 REGEX_CHECK_STATUS;
1706 REGEX_ASSERT(n==7);
1707 REGEX_ASSERT(fields[0]==" ");
1708 REGEX_ASSERT(fields[1]=="a");
1709 REGEX_ASSERT(fields[2]=="Now is ");
1710 REGEX_ASSERT(fields[3]=="b");
1711 REGEX_ASSERT(fields[4]=="the time");
1712 REGEX_ASSERT(fields[5]=="c");
1713 REGEX_ASSERT(fields[6]=="");
1714
1715 status = U_ZERO_ERROR;
1716 fields[6] = "foo";
1717 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1718 REGEX_CHECK_STATUS;
1719 REGEX_ASSERT(n==6);
1720 REGEX_ASSERT(fields[0]==" ");
1721 REGEX_ASSERT(fields[1]=="a");
1722 REGEX_ASSERT(fields[2]=="Now is ");
1723 REGEX_ASSERT(fields[3]=="b");
1724 REGEX_ASSERT(fields[4]=="the time");
1725 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1726 REGEX_ASSERT(fields[6]=="foo");
1727
1728 status = U_ZERO_ERROR;
1729 fields[5] = "foo";
1730 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1731 REGEX_CHECK_STATUS;
1732 REGEX_ASSERT(n==5);
1733 REGEX_ASSERT(fields[0]==" ");
1734 REGEX_ASSERT(fields[1]=="a");
1735 REGEX_ASSERT(fields[2]=="Now is ");
1736 REGEX_ASSERT(fields[3]=="b");
1737 REGEX_ASSERT(fields[4]=="the time<c>");
1738 REGEX_ASSERT(fields[5]=="foo");
1739
1740 status = U_ZERO_ERROR;
1741 fields[5] = "foo";
1742 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1743 REGEX_CHECK_STATUS;
1744 REGEX_ASSERT(n==5);
1745 REGEX_ASSERT(fields[0]==" ");
1746 REGEX_ASSERT(fields[1]=="a");
1747 REGEX_ASSERT(fields[2]=="Now is ");
1748 REGEX_ASSERT(fields[3]=="b");
1749 REGEX_ASSERT(fields[4]=="the time");
1750 REGEX_ASSERT(fields[5]=="foo");
1751
1752 status = U_ZERO_ERROR;
1753 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1754 REGEX_CHECK_STATUS;
1755 REGEX_ASSERT(n==4);
1756 REGEX_ASSERT(fields[0]==" ");
1757 REGEX_ASSERT(fields[1]=="a");
1758 REGEX_ASSERT(fields[2]=="Now is ");
1759 REGEX_ASSERT(fields[3]=="the time<c>");
1760 status = U_ZERO_ERROR;
1761 delete pat1;
1762
1763 pat1 = RegexPattern::compile("([-,])", pe, status);
1764 REGEX_CHECK_STATUS;
1765 n = pat1->split("1-10,20", fields, 10, status);
1766 REGEX_CHECK_STATUS;
1767 REGEX_ASSERT(n==5);
1768 REGEX_ASSERT(fields[0]=="1");
1769 REGEX_ASSERT(fields[1]=="-");
1770 REGEX_ASSERT(fields[2]=="10");
1771 REGEX_ASSERT(fields[3]==",");
1772 REGEX_ASSERT(fields[4]=="20");
1773 delete pat1;
1774
1775 // Test split of string with empty trailing fields
1776 pat1 = RegexPattern::compile(",", pe, status);
1777 REGEX_CHECK_STATUS;
1778 n = pat1->split("a,b,c,", fields, 10, status);
1779 REGEX_CHECK_STATUS;
1780 REGEX_ASSERT(n==4);
1781 REGEX_ASSERT(fields[0]=="a");
1782 REGEX_ASSERT(fields[1]=="b");
1783 REGEX_ASSERT(fields[2]=="c");
1784 REGEX_ASSERT(fields[3]=="");
1785
1786 n = pat1->split("a,,,", fields, 10, status);
1787 REGEX_CHECK_STATUS;
1788 REGEX_ASSERT(n==4);
1789 REGEX_ASSERT(fields[0]=="a");
1790 REGEX_ASSERT(fields[1]=="");
1791 REGEX_ASSERT(fields[2]=="");
1792 REGEX_ASSERT(fields[3]=="");
1793 delete pat1;
1794
1795 // Split Separator with zero length match.
1796 pat1 = RegexPattern::compile(":?", pe, status);
1797 REGEX_CHECK_STATUS;
1798 n = pat1->split("abc", fields, 10, status);
1799 REGEX_CHECK_STATUS;
1800 REGEX_ASSERT(n==5);
1801 REGEX_ASSERT(fields[0]=="");
1802 REGEX_ASSERT(fields[1]=="a");
1803 REGEX_ASSERT(fields[2]=="b");
1804 REGEX_ASSERT(fields[3]=="c");
1805 REGEX_ASSERT(fields[4]=="");
1806
1807 delete pat1;
1808
1809 //
1810 // RegexPattern::pattern()
1811 //
1812 pat1 = new RegexPattern();
1813 REGEX_ASSERT(pat1->pattern() == "");
1814 delete pat1;
1815
1816 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1817 REGEX_CHECK_STATUS;
1818 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1819 delete pat1;
1820
1821
1822 //
1823 // classID functions
1824 //
1825 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1826 REGEX_CHECK_STATUS;
1827 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1828 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1829 UnicodeString Hello("Hello, world.");
1830 RegexMatcher *m = pat1->matcher(Hello, status);
1831 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1832 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1833 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1834 delete m;
1835 delete pat1;
1836
1837}
1838
1839//---------------------------------------------------------------------------
1840//
1841// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1842// is present and working, but excluding functions
1843// implementing replace operations.
1844//
1845//---------------------------------------------------------------------------
1846void RegexTest::API_Match_UTF8() {
1847 UParseError pe;
1848 UErrorCode status=U_ZERO_ERROR;
1849 int32_t flags = 0;
1850
1851 //
1852 // Debug - slide failing test cases early
1853 //
1854#if 0
1855 {
1856 }
1857 return;
1858#endif
1859
1860 //
1861 // Simple pattern compilation
1862 //
1863 {
1864 UText re = UTEXT_INITIALIZER;
1865 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1866 REGEX_VERBOSE_TEXT(&re);
1867 RegexPattern *pat2;
1868 pat2 = RegexPattern::compile(&re, flags, pe, status);
1869 REGEX_CHECK_STATUS;
1870
1871 UText input1 = UTEXT_INITIALIZER;
1872 UText input2 = UTEXT_INITIALIZER;
1873 UText empty = UTEXT_INITIALIZER;
1874 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1875 REGEX_VERBOSE_TEXT(&input1);
1876 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1877 REGEX_VERBOSE_TEXT(&input2);
1878 utext_openUChars(&empty, NULL, 0, &status);
1879
1880 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1881 int32_t input2Len = strlen("not abc");
1882
1883
1884 //
1885 // Matcher creation and reset.
1886 //
1887 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1888 REGEX_CHECK_STATUS;
1889 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1890 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1891 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1892 m1->reset(&input2);
1893 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1894 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1895 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1896 m1->reset(&input1);
1897 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1898 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1899 m1->reset(&empty);
1900 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1901 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1902
1903 //
1904 // reset(pos, status)
1905 //
1906 m1->reset(&input1);
1907 m1->reset(4, status);
1908 REGEX_CHECK_STATUS;
1909 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1910 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1911
1912 m1->reset(-1, status);
1913 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1914 status = U_ZERO_ERROR;
1915
1916 m1->reset(0, status);
1917 REGEX_CHECK_STATUS;
1918 status = U_ZERO_ERROR;
1919
1920 m1->reset(input1Len-1, status);
1921 REGEX_CHECK_STATUS;
1922 status = U_ZERO_ERROR;
1923
1924 m1->reset(input1Len, status);
1925 REGEX_CHECK_STATUS;
1926 status = U_ZERO_ERROR;
1927
1928 m1->reset(input1Len+1, status);
1929 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1930 status = U_ZERO_ERROR;
1931
1932 //
1933 // match(pos, status)
1934 //
1935 m1->reset(&input2);
1936 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1937 m1->reset();
1938 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1939 m1->reset();
1940 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1941 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1942 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1943 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1944
1945 // Match() at end of string should fail, but should not
1946 // be an error.
1947 status = U_ZERO_ERROR;
1948 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1949 REGEX_CHECK_STATUS;
1950
1951 // Match beyond end of string should fail with an error.
1952 status = U_ZERO_ERROR;
1953 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1954 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1955
1956 // Successful match at end of string.
1957 {
1958 status = U_ZERO_ERROR;
1959 RegexMatcher m("A?", 0, status); // will match zero length string.
1960 REGEX_CHECK_STATUS;
1961 m.reset(&input1);
1962 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1963 REGEX_CHECK_STATUS;
1964 m.reset(&empty);
1965 REGEX_ASSERT(m.matches(0, status) == TRUE);
1966 REGEX_CHECK_STATUS;
1967 }
1968
1969
1970 //
1971 // lookingAt(pos, status)
1972 //
1973 status = U_ZERO_ERROR;
1974 m1->reset(&input2); // "not abc"
1975 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1976 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1977 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1978 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1980 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1981 status = U_ZERO_ERROR;
1982 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1983 REGEX_CHECK_STATUS;
1984 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1985 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1986
1987 delete m1;
1988 delete pat2;
1989
1990 utext_close(&re);
1991 utext_close(&input1);
1992 utext_close(&input2);
1993 utext_close(&empty);
1994 }
1995
1996
1997 //
1998 // Capture Group.
1999 // RegexMatcher::start();
2000 // RegexMatcher::end();
2001 // RegexMatcher::groupCount();
2002 //
2003 {
2004 int32_t flags=0;
2005 UParseError pe;
2006 UErrorCode status=U_ZERO_ERROR;
2007 UText re=UTEXT_INITIALIZER;
2008 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2009 utext_openUTF8(&re, str_01234567_pat, -1, &status);
2010
2011 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2012 REGEX_CHECK_STATUS;
2013
2014 UText input = UTEXT_INITIALIZER;
2015 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2016 utext_openUTF8(&input, str_0123456789, -1, &status);
2017
2018 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2019 REGEX_CHECK_STATUS;
2020 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2021 static const int32_t matchStarts[] = {0, 2, 4, 8};
2022 static const int32_t matchEnds[] = {10, 8, 6, 10};
2023 int32_t i;
2024 for (i=0; i<4; i++) {
2025 int32_t actualStart = matcher->start(i, status);
2026 REGEX_CHECK_STATUS;
2027 if (actualStart != matchStarts[i]) {
2028 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2029 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2030 }
2031 int32_t actualEnd = matcher->end(i, status);
2032 REGEX_CHECK_STATUS;
2033 if (actualEnd != matchEnds[i]) {
2034 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2035 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2036 }
2037 }
2038
2039 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2040 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2041
2042 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2043 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2044 matcher->reset();
2045 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2046
2047 matcher->lookingAt(status);
2048
2049 UnicodeString dest;
2050 UText destText = UTEXT_INITIALIZER;
2051 utext_openUnicodeString(&destText, &dest, &status);
2052 UText *result;
2053 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2054 // Test shallow-clone API
2055 int64_t group_len;
2056 result = matcher->group((UText *)NULL, group_len, status);
2057 REGEX_CHECK_STATUS;
2058 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2059 utext_close(result);
2060 result = matcher->group(0, &destText, group_len, status);
2061 REGEX_CHECK_STATUS;
2062 REGEX_ASSERT(result == &destText);
2063 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2064 // destText is now immutable, reopen it
2065 utext_close(&destText);
2066 utext_openUnicodeString(&destText, &dest, &status);
2067
2068 int64_t length;
2069 result = matcher->group(0, NULL, length, status);
2070 REGEX_CHECK_STATUS;
2071 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2072 utext_close(result);
2073 result = matcher->group(0, &destText, length, status);
2074 REGEX_CHECK_STATUS;
2075 REGEX_ASSERT(result == &destText);
2076 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2077 REGEX_ASSERT(length == 10);
2078 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079
2080 // Capture Group 1 == "234567"
2081 result = matcher->group(1, NULL, length, status);
2082 REGEX_CHECK_STATUS;
2083 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2084 REGEX_ASSERT(length == 6);
2085 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2086 utext_close(result);
2087
2088 result = matcher->group(1, &destText, length, status);
2089 REGEX_CHECK_STATUS;
2090 REGEX_ASSERT(result == &destText);
2091 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2092 REGEX_ASSERT(length == 6);
2093 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2094 utext_close(result);
2095
2096 // Capture Group 2 == "45"
2097 result = matcher->group(2, NULL, length, status);
2098 REGEX_CHECK_STATUS;
2099 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2100 REGEX_ASSERT(length == 2);
2101 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2102 utext_close(result);
2103
2104 result = matcher->group(2, &destText, length, status);
2105 REGEX_CHECK_STATUS;
2106 REGEX_ASSERT(result == &destText);
2107 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2108 REGEX_ASSERT(length == 2);
2109 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2110 utext_close(result);
2111
2112 // Capture Group 3 == "89"
2113 result = matcher->group(3, NULL, length, status);
2114 REGEX_CHECK_STATUS;
2115 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2116 REGEX_ASSERT(length == 2);
2117 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2118 utext_close(result);
2119
2120 result = matcher->group(3, &destText, length, status);
2121 REGEX_CHECK_STATUS;
2122 REGEX_ASSERT(result == &destText);
2123 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2124 REGEX_ASSERT(length == 2);
2125 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2126 utext_close(result);
2127
2128 // Capture Group number out of range.
2129 status = U_ZERO_ERROR;
2130 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2131 status = U_ZERO_ERROR;
2132 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2133 status = U_ZERO_ERROR;
2134 matcher->reset();
2135 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2136
2137 delete matcher;
2138 delete pat;
2139
2140 utext_close(&destText);
2141 utext_close(&input);
2142 utext_close(&re);
2143 }
2144
2145 //
2146 // find
2147 //
2148 {
2149 int32_t flags=0;
2150 UParseError pe;
2151 UErrorCode status=U_ZERO_ERROR;
2152 UText re=UTEXT_INITIALIZER;
2153 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2154 utext_openUTF8(&re, str_abc, -1, &status);
2155
2156 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2157 REGEX_CHECK_STATUS;
2158 UText input = UTEXT_INITIALIZER;
2159 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2160 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2161 // 012345678901234567
2162
2163 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2164 REGEX_CHECK_STATUS;
2165 REGEX_ASSERT(matcher->find());
2166 REGEX_ASSERT(matcher->start(status) == 1);
2167 REGEX_ASSERT(matcher->find());
2168 REGEX_ASSERT(matcher->start(status) == 6);
2169 REGEX_ASSERT(matcher->find());
2170 REGEX_ASSERT(matcher->start(status) == 12);
2171 REGEX_ASSERT(matcher->find() == FALSE);
2172 REGEX_ASSERT(matcher->find() == FALSE);
2173
2174 matcher->reset();
2175 REGEX_ASSERT(matcher->find());
2176 REGEX_ASSERT(matcher->start(status) == 1);
2177
2178 REGEX_ASSERT(matcher->find(0, status));
2179 REGEX_ASSERT(matcher->start(status) == 1);
2180 REGEX_ASSERT(matcher->find(1, status));
2181 REGEX_ASSERT(matcher->start(status) == 1);
2182 REGEX_ASSERT(matcher->find(2, status));
2183 REGEX_ASSERT(matcher->start(status) == 6);
2184 REGEX_ASSERT(matcher->find(12, status));
2185 REGEX_ASSERT(matcher->start(status) == 12);
2186 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2187 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2188 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2189 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2190
2191 status = U_ZERO_ERROR;
2192 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2193 status = U_ZERO_ERROR;
2194 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2195
2196 REGEX_ASSERT(matcher->groupCount() == 0);
2197
2198 delete matcher;
2199 delete pat;
2200
2201 utext_close(&input);
2202 utext_close(&re);
2203 }
2204
2205
2206 //
2207 // find, with \G in pattern (true if at the end of a previous match).
2208 //
2209 {
2210 int32_t flags=0;
2211 UParseError pe;
2212 UErrorCode status=U_ZERO_ERROR;
2213 UText re=UTEXT_INITIALIZER;
2214 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2215 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2216
2217 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2218
2219 REGEX_CHECK_STATUS;
2220 UText input = UTEXT_INITIALIZER;
2221 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2222 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2223 // 012345678901234567
2224
2225 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2226 REGEX_CHECK_STATUS;
2227 REGEX_ASSERT(matcher->find());
2228 REGEX_ASSERT(matcher->start(status) == 0);
2229 REGEX_ASSERT(matcher->start(1, status) == -1);
2230 REGEX_ASSERT(matcher->start(2, status) == 1);
2231
2232 REGEX_ASSERT(matcher->find());
2233 REGEX_ASSERT(matcher->start(status) == 4);
2234 REGEX_ASSERT(matcher->start(1, status) == 4);
2235 REGEX_ASSERT(matcher->start(2, status) == -1);
2236 REGEX_CHECK_STATUS;
2237
2238 delete matcher;
2239 delete pat;
2240
2241 utext_close(&input);
2242 utext_close(&re);
2243 }
2244
2245 //
2246 // find with zero length matches, match position should bump ahead
2247 // to prevent loops.
2248 //
2249 {
2250 int32_t i;
2251 UErrorCode status=U_ZERO_ERROR;
2252 RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere,
2253 // using an always-true look-ahead.
2254 REGEX_CHECK_STATUS;
2255 UText s = UTEXT_INITIALIZER;
2256 utext_openUTF8(&s, " ", -1, &status);
2257 m.reset(&s);
2258 for (i=0; ; i++) {
2259 if (m.find() == FALSE) {
2260 break;
2261 }
2262 REGEX_ASSERT(m.start(status) == i);
2263 REGEX_ASSERT(m.end(status) == i);
2264 }
2265 REGEX_ASSERT(i==5);
2266
2267 // Check that the bump goes over characters outside the BMP OK
2268 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2269 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2270 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2271 m.reset(&s);
2272 for (i=0; ; i+=4) {
2273 if (m.find() == FALSE) {
2274 break;
2275 }
2276 REGEX_ASSERT(m.start(status) == i);
2277 REGEX_ASSERT(m.end(status) == i);
2278 }
2279 REGEX_ASSERT(i==20);
2280
2281 utext_close(&s);
2282 }
2283 {
2284 // find() loop breaking test.
2285 // with pattern of /.?/, should see a series of one char matches, then a single
2286 // match of zero length at the end of the input string.
2287 int32_t i;
2288 UErrorCode status=U_ZERO_ERROR;
2289 RegexMatcher m(".?", 0, status);
2290 REGEX_CHECK_STATUS;
2291 UText s = UTEXT_INITIALIZER;
2292 utext_openUTF8(&s, " ", -1, &status);
2293 m.reset(&s);
2294 for (i=0; ; i++) {
2295 if (m.find() == FALSE) {
2296 break;
2297 }
2298 REGEX_ASSERT(m.start(status) == i);
2299 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2300 }
2301 REGEX_ASSERT(i==5);
2302
2303 utext_close(&s);
2304 }
2305
2306
2307 //
2308 // Matchers with no input string behave as if they had an empty input string.
2309 //
2310
2311 {
2312 UErrorCode status = U_ZERO_ERROR;
2313 RegexMatcher m(".?", 0, status);
2314 REGEX_CHECK_STATUS;
2315 REGEX_ASSERT(m.find());
2316 REGEX_ASSERT(m.start(status) == 0);
2317 REGEX_ASSERT(m.input() == "");
2318 }
2319 {
2320 UErrorCode status = U_ZERO_ERROR;
2321 RegexPattern *p = RegexPattern::compile(".", 0, status);
2322 RegexMatcher *m = p->matcher(status);
2323 REGEX_CHECK_STATUS;
2324
2325 REGEX_ASSERT(m->find() == FALSE);
2326 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2327 delete m;
2328 delete p;
2329 }
2330
2331 //
2332 // Regions
2333 //
2334 {
2335 UErrorCode status = U_ZERO_ERROR;
2336 UText testPattern = UTEXT_INITIALIZER;
2337 UText testText = UTEXT_INITIALIZER;
2338 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2339 REGEX_VERBOSE_TEXT(&testPattern);
2340 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2341 REGEX_VERBOSE_TEXT(&testText);
2342
2343 RegexMatcher m(&testPattern, &testText, 0, status);
2344 REGEX_CHECK_STATUS;
2345 REGEX_ASSERT(m.regionStart() == 0);
2346 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2347 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2348 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2349
2350 m.region(2,4, status);
2351 REGEX_CHECK_STATUS;
2352 REGEX_ASSERT(m.matches(status));
2353 REGEX_ASSERT(m.start(status)==2);
2354 REGEX_ASSERT(m.end(status)==4);
2355 REGEX_CHECK_STATUS;
2356
2357 m.reset();
2358 REGEX_ASSERT(m.regionStart() == 0);
2359 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2360
2361 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2362 REGEX_VERBOSE_TEXT(&testText);
2363 m.reset(&testText);
2364 REGEX_ASSERT(m.regionStart() == 0);
2365 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2366
2367 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2368 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2369 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2370 REGEX_ASSERT(&m == &m.reset());
2371 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2372
2373 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2374 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2375 REGEX_ASSERT(&m == &m.reset());
2376 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2377
2378 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2379 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2380 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2381 REGEX_ASSERT(&m == &m.reset());
2382 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2383
2384 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2385 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2386 REGEX_ASSERT(&m == &m.reset());
2387 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2388
2389 utext_close(&testText);
2390 utext_close(&testPattern);
2391 }
2392
2393 //
2394 // hitEnd() and requireEnd()
2395 //
2396 {
2397 UErrorCode status = U_ZERO_ERROR;
2398 UText testPattern = UTEXT_INITIALIZER;
2399 UText testText = UTEXT_INITIALIZER;
2400 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2401 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2402 utext_openUTF8(&testPattern, str_, -1, &status);
2403 utext_openUTF8(&testText, str_aabb, -1, &status);
2404
2405 RegexMatcher m1(&testPattern, &testText, 0, status);
2406 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2407 REGEX_ASSERT(m1.hitEnd() == TRUE);
2408 REGEX_ASSERT(m1.requireEnd() == FALSE);
2409 REGEX_CHECK_STATUS;
2410
2411 status = U_ZERO_ERROR;
2412 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2413 utext_openUTF8(&testPattern, str_a, -1, &status);
2414 RegexMatcher m2(&testPattern, &testText, 0, status);
2415 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2416 REGEX_ASSERT(m2.hitEnd() == FALSE);
2417 REGEX_ASSERT(m2.requireEnd() == FALSE);
2418 REGEX_CHECK_STATUS;
2419
2420 status = U_ZERO_ERROR;
2421 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2422 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2423 RegexMatcher m3(&testPattern, &testText, 0, status);
2424 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2425 REGEX_ASSERT(m3.hitEnd() == TRUE);
2426 REGEX_ASSERT(m3.requireEnd() == TRUE);
2427 REGEX_CHECK_STATUS;
2428
2429 utext_close(&testText);
2430 utext_close(&testPattern);
2431 }
2432}
2433
2434
2435//---------------------------------------------------------------------------
2436//
2437// API_Replace_UTF8 API test for class RegexMatcher, testing the
2438// Replace family of functions.
2439//
2440//---------------------------------------------------------------------------
2441void RegexTest::API_Replace_UTF8() {
2442 //
2443 // Replace
2444 //
2445 int32_t flags=0;
2446 UParseError pe;
2447 UErrorCode status=U_ZERO_ERROR;
2448
2449 UText re=UTEXT_INITIALIZER;
2450 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2451 REGEX_VERBOSE_TEXT(&re);
2452 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2453 REGEX_CHECK_STATUS;
2454
2455 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2456 // 012345678901234567
2457 UText dataText = UTEXT_INITIALIZER;
2458 utext_openUTF8(&dataText, data, -1, &status);
2459 REGEX_CHECK_STATUS;
2460 REGEX_VERBOSE_TEXT(&dataText);
2461 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2462
2463 //
2464 // Plain vanilla matches.
2465 //
2466 UnicodeString dest;
2467 UText destText = UTEXT_INITIALIZER;
2468 utext_openUnicodeString(&destText, &dest, &status);
2469 UText *result;
2470
2471 UText replText = UTEXT_INITIALIZER;
2472
2473 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2474 utext_openUTF8(&replText, str_yz, -1, &status);
2475 REGEX_VERBOSE_TEXT(&replText);
2476 result = matcher->replaceFirst(&replText, NULL, status);
2477 REGEX_CHECK_STATUS;
2478 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2479 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2480 utext_close(result);
2481 result = matcher->replaceFirst(&replText, &destText, status);
2482 REGEX_CHECK_STATUS;
2483 REGEX_ASSERT(result == &destText);
2484 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2485
2486 result = matcher->replaceAll(&replText, NULL, status);
2487 REGEX_CHECK_STATUS;
2488 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2489 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2490 utext_close(result);
2491
2492 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2493 result = matcher->replaceAll(&replText, &destText, status);
2494 REGEX_CHECK_STATUS;
2495 REGEX_ASSERT(result == &destText);
2496 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2497
2498 //
2499 // Plain vanilla non-matches.
2500 //
2501 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2502 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2503 matcher->reset(&dataText);
2504
2505 result = matcher->replaceFirst(&replText, NULL, status);
2506 REGEX_CHECK_STATUS;
2507 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2508 utext_close(result);
2509 result = matcher->replaceFirst(&replText, &destText, status);
2510 REGEX_CHECK_STATUS;
2511 REGEX_ASSERT(result == &destText);
2512 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2513
2514 result = matcher->replaceAll(&replText, NULL, status);
2515 REGEX_CHECK_STATUS;
2516 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2517 utext_close(result);
2518 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2519 result = matcher->replaceAll(&replText, &destText, status);
2520 REGEX_CHECK_STATUS;
2521 REGEX_ASSERT(result == &destText);
2522 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2523
2524 //
2525 // Empty source string
2526 //
2527 utext_openUTF8(&dataText, NULL, 0, &status);
2528 matcher->reset(&dataText);
2529
2530 result = matcher->replaceFirst(&replText, NULL, status);
2531 REGEX_CHECK_STATUS;
2532 REGEX_ASSERT_UTEXT_UTF8("", result);
2533 utext_close(result);
2534 result = matcher->replaceFirst(&replText, &destText, status);
2535 REGEX_CHECK_STATUS;
2536 REGEX_ASSERT(result == &destText);
2537 REGEX_ASSERT_UTEXT_UTF8("", result);
2538
2539 result = matcher->replaceAll(&replText, NULL, status);
2540 REGEX_CHECK_STATUS;
2541 REGEX_ASSERT_UTEXT_UTF8("", result);
2542 utext_close(result);
2543 result = matcher->replaceAll(&replText, &destText, status);
2544 REGEX_CHECK_STATUS;
2545 REGEX_ASSERT(result == &destText);
2546 REGEX_ASSERT_UTEXT_UTF8("", result);
2547
2548 //
2549 // Empty substitution string
2550 //
2551 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2552 matcher->reset(&dataText);
2553
2554 utext_openUTF8(&replText, NULL, 0, &status);
2555 result = matcher->replaceFirst(&replText, NULL, status);
2556 REGEX_CHECK_STATUS;
2557 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2558 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2559 utext_close(result);
2560 result = matcher->replaceFirst(&replText, &destText, status);
2561 REGEX_CHECK_STATUS;
2562 REGEX_ASSERT(result == &destText);
2563 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2564
2565 result = matcher->replaceAll(&replText, NULL, status);
2566 REGEX_CHECK_STATUS;
2567 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2568 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2569 utext_close(result);
2570 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2571 result = matcher->replaceAll(&replText, &destText, status);
2572 REGEX_CHECK_STATUS;
2573 REGEX_ASSERT(result == &destText);
2574 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2575
2576 //
2577 // match whole string
2578 //
2579 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2580 utext_openUTF8(&dataText, str_abc, -1, &status);
2581 matcher->reset(&dataText);
2582
2583 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2584 utext_openUTF8(&replText, str_xyz, -1, &status);
2585 result = matcher->replaceFirst(&replText, NULL, status);
2586 REGEX_CHECK_STATUS;
2587 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2588 utext_close(result);
2589 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2590 result = matcher->replaceFirst(&replText, &destText, status);
2591 REGEX_CHECK_STATUS;
2592 REGEX_ASSERT(result == &destText);
2593 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2594
2595 result = matcher->replaceAll(&replText, NULL, status);
2596 REGEX_CHECK_STATUS;
2597 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2598 utext_close(result);
2599 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2600 result = matcher->replaceAll(&replText, &destText, status);
2601 REGEX_CHECK_STATUS;
2602 REGEX_ASSERT(result == &destText);
2603 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2604
2605 //
2606 // Capture Group, simple case
2607 //
2608 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2609 utext_openUTF8(&re, str_add, -1, &status);
2610 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2611 REGEX_CHECK_STATUS;
2612
2613 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2614 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2615 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2616 REGEX_CHECK_STATUS;
2617
2618 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2619 utext_openUTF8(&replText, str_11, -1, &status);
2620 result = matcher2->replaceFirst(&replText, NULL, status);
2621 REGEX_CHECK_STATUS;
2622 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2623 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2624 utext_close(result);
2625 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2626 result = matcher2->replaceFirst(&replText, &destText, status);
2627 REGEX_CHECK_STATUS;
2628 REGEX_ASSERT(result == &destText);
2629 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2630
2631 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2632 utext_openUTF8(&replText, str_v, -1, &status);
2633 REGEX_VERBOSE_TEXT(&replText);
2634 result = matcher2->replaceFirst(&replText, NULL, status);
2635 REGEX_CHECK_STATUS;
2636 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2637 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2638 utext_close(result);
2639 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2640 result = matcher2->replaceFirst(&replText, &destText, status);
2641 REGEX_CHECK_STATUS;
2642 REGEX_ASSERT(result == &destText);
2643 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2644
2645 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2646 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2647 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2648 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2649 result = matcher2->replaceFirst(&replText, NULL, status);
2650 REGEX_CHECK_STATUS;
2651 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2652 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2653 utext_close(result);
2654 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2655 result = matcher2->replaceFirst(&replText, &destText, status);
2656 REGEX_CHECK_STATUS;
2657 REGEX_ASSERT(result == &destText);
2658 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2659
2660 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2661 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2662 // 012345678901234567890123456
2663 supplDigitChars[22] = 0xF0;
2664 supplDigitChars[23] = 0x9D;
2665 supplDigitChars[24] = 0x9F;
2666 supplDigitChars[25] = 0x8F;
2667 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2668
2669 result = matcher2->replaceFirst(&replText, NULL, status);
2670 REGEX_CHECK_STATUS;
2671 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2672 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2673 utext_close(result);
2674 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2675 result = matcher2->replaceFirst(&replText, &destText, status);
2676 REGEX_CHECK_STATUS;
2677 REGEX_ASSERT(result == &destText);
2678 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2679 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2680 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2681 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2682// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2683 utext_close(result);
2684 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2685 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2686 REGEX_ASSERT(result == &destText);
2687// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2688
2689 //
2690 // Replacement String with \u hex escapes
2691 //
2692 {
2693 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2694 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2695 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2696 utext_openUTF8(&replText, str_u0043, -1, &status);
2697 matcher->reset(&dataText);
2698
2699 result = matcher->replaceAll(&replText, NULL, status);
2700 REGEX_CHECK_STATUS;
2701 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2702 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2703 utext_close(result);
2704 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2705 result = matcher->replaceAll(&replText, &destText, status);
2706 REGEX_CHECK_STATUS;
2707 REGEX_ASSERT(result == &destText);
2708 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2709 }
2710 {
2711 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2712 utext_openUTF8(&dataText, str_abc, -1, &status);
2713 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2714 utext_openUTF8(&replText, str_U00010000, -1, &status);
2715 matcher->reset(&dataText);
2716
2717 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2718 // 0123456789
2719 expected[2] = 0xF0;
2720 expected[3] = 0x90;
2721 expected[4] = 0x80;
2722 expected[5] = 0x80;
2723
2724 result = matcher->replaceAll(&replText, NULL, status);
2725 REGEX_CHECK_STATUS;
2726 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2727 utext_close(result);
2728 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2729 result = matcher->replaceAll(&replText, &destText, status);
2730 REGEX_CHECK_STATUS;
2731 REGEX_ASSERT(result == &destText);
2732 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2733 }
2734 // TODO: need more thorough testing of capture substitutions.
2735
2736 // Bug 4057
2737 //
2738 {
2739 status = U_ZERO_ERROR;
2740const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2741const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2742const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2743 utext_openUTF8(&re, str_ssee, -1, &status);
2744 utext_openUTF8(&dataText, str_blah, -1, &status);
2745 utext_openUTF8(&replText, str_ooh, -1, &status);
2746
2747 RegexMatcher m(&re, 0, status);
2748 REGEX_CHECK_STATUS;
2749
2750 UnicodeString result;
2751 UText resultText = UTEXT_INITIALIZER;
2752 utext_openUnicodeString(&resultText, &result, &status);
2753
2754 // Multiple finds do NOT bump up the previous appendReplacement postion.
2755 m.reset(&dataText);
2756 m.find();
2757 m.find();
2758 m.appendReplacement(&resultText, &replText, status);
2759 REGEX_CHECK_STATUS;
2760 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2762
2763 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2764 status = U_ZERO_ERROR;
2765 result.truncate(0);
2766 utext_openUnicodeString(&resultText, &result, &status);
2767 m.reset(10, status);
2768 m.find();
2769 m.find();
2770 m.appendReplacement(&resultText, &replText, status);
2771 REGEX_CHECK_STATUS;
2772 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2773 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2774
2775 // find() at interior of string, appendReplacement still starts at beginning.
2776 status = U_ZERO_ERROR;
2777 result.truncate(0);
2778 utext_openUnicodeString(&resultText, &result, &status);
2779 m.reset();
2780 m.find(10, status);
2781 m.find();
2782 m.appendReplacement(&resultText, &replText, status);
2783 REGEX_CHECK_STATUS;
2784 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2785 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2786
2787 m.appendTail(&resultText, status);
2788 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2789 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2790
2791 utext_close(&resultText);
2792 }
2793
2794 delete matcher2;
2795 delete pat2;
2796 delete matcher;
2797 delete pat;
2798
2799 utext_close(&dataText);
2800 utext_close(&replText);
2801 utext_close(&destText);
2802 utext_close(&re);
2803}
2804
2805
2806//---------------------------------------------------------------------------
2807//
2808// API_Pattern_UTF8 Test that the API for class RegexPattern is
2809// present and nominally working.
2810//
2811//---------------------------------------------------------------------------
2812void RegexTest::API_Pattern_UTF8() {
2813 RegexPattern pata; // Test default constructor to not crash.
2814 RegexPattern patb;
2815
2816 REGEX_ASSERT(pata == patb);
2817 REGEX_ASSERT(pata == pata);
2818
2819 UText re1 = UTEXT_INITIALIZER;
2820 UText re2 = UTEXT_INITIALIZER;
2821 UErrorCode status = U_ZERO_ERROR;
2822 UParseError pe;
2823
2824 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2825 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2826 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2827 utext_openUTF8(&re2, str_def, -1, &status);
2828
2829 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2830 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2831 REGEX_CHECK_STATUS;
2832 REGEX_ASSERT(*pat1 == *pat1);
2833 REGEX_ASSERT(*pat1 != pata);
2834
2835 // Assign
2836 patb = *pat1;
2837 REGEX_ASSERT(patb == *pat1);
2838
2839 // Copy Construct
2840 RegexPattern patc(*pat1);
2841 REGEX_ASSERT(patc == *pat1);
2842 REGEX_ASSERT(patb == patc);
2843 REGEX_ASSERT(pat1 != pat2);
2844 patb = *pat2;
2845 REGEX_ASSERT(patb != patc);
2846 REGEX_ASSERT(patb == *pat2);
2847
2848 // Compile with no flags.
2849 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2850 REGEX_ASSERT(*pat1a == *pat1);
2851
2852 REGEX_ASSERT(pat1a->flags() == 0);
2853
2854 // Compile with different flags should be not equal
2855 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2856 REGEX_CHECK_STATUS;
2857
2858 REGEX_ASSERT(*pat1b != *pat1a);
2859 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2860 REGEX_ASSERT(pat1a->flags() == 0);
2861 delete pat1b;
2862
2863 // clone
2864 RegexPattern *pat1c = pat1->clone();
2865 REGEX_ASSERT(*pat1c == *pat1);
2866 REGEX_ASSERT(*pat1c != *pat2);
2867
2868 delete pat1c;
2869 delete pat1a;
2870 delete pat1;
2871 delete pat2;
2872
2873 utext_close(&re1);
2874 utext_close(&re2);
2875
2876
2877 //
2878 // Verify that a matcher created from a cloned pattern works.
2879 // (Jitterbug 3423)
2880 //
2881 {
2882 UErrorCode status = U_ZERO_ERROR;
2883 UText pattern = UTEXT_INITIALIZER;
2884 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2885 utext_openUTF8(&pattern, str_pL, -1, &status);
2886
2887 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2888 RegexPattern *pClone = pSource->clone();
2889 delete pSource;
2890 RegexMatcher *mFromClone = pClone->matcher(status);
2891 REGEX_CHECK_STATUS;
2892
2893 UText input = UTEXT_INITIALIZER;
2894 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2895 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2896 mFromClone->reset(&input);
2897 REGEX_ASSERT(mFromClone->find() == TRUE);
2898 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2899 REGEX_ASSERT(mFromClone->find() == TRUE);
2900 REGEX_ASSERT(mFromClone->group(status) == "World");
2901 REGEX_ASSERT(mFromClone->find() == FALSE);
2902 delete mFromClone;
2903 delete pClone;
2904
2905 utext_close(&input);
2906 utext_close(&pattern);
2907 }
2908
2909 //
2910 // matches convenience API
2911 //
2912 {
2913 UErrorCode status = U_ZERO_ERROR;
2914 UText pattern = UTEXT_INITIALIZER;
2915 UText input = UTEXT_INITIALIZER;
2916
2917 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2918 utext_openUTF8(&input, str_randominput, -1, &status);
2919
2920 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2921 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2922 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2923 REGEX_CHECK_STATUS;
2924
2925 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2926 utext_openUTF8(&pattern, str_abc, -1, &status);
2927 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2928 REGEX_CHECK_STATUS;
2929
2930 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2931 utext_openUTF8(&pattern, str_nput, -1, &status);
2932 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2933 REGEX_CHECK_STATUS;
2934
2935 utext_openUTF8(&pattern, str_randominput, -1, &status);
2936 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2937 REGEX_CHECK_STATUS;
2938
2939 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2940 utext_openUTF8(&pattern, str_u, -1, &status);
2941 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2942 REGEX_CHECK_STATUS;
2943
2944 utext_openUTF8(&input, str_abc, -1, &status);
2945 utext_openUTF8(&pattern, str_abc, -1, &status);
2946 status = U_INDEX_OUTOFBOUNDS_ERROR;
2947 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2948 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2949
2950 utext_close(&input);
2951 utext_close(&pattern);
2952 }
2953
2954
2955 //
2956 // Split()
2957 //
2958 status = U_ZERO_ERROR;
2959 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2960 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2961 pat1 = RegexPattern::compile(&re1, pe, status);
2962 REGEX_CHECK_STATUS;
2963 UnicodeString fields[10];
2964
2965 int32_t n;
2966 n = pat1->split("Now is the time", fields, 10, status);
2967 REGEX_CHECK_STATUS;
2968 REGEX_ASSERT(n==4);
2969 REGEX_ASSERT(fields[0]=="Now");
2970 REGEX_ASSERT(fields[1]=="is");
2971 REGEX_ASSERT(fields[2]=="the");
2972 REGEX_ASSERT(fields[3]=="time");
2973 REGEX_ASSERT(fields[4]=="");
2974
2975 n = pat1->split("Now is the time", fields, 2, status);
2976 REGEX_CHECK_STATUS;
2977 REGEX_ASSERT(n==2);
2978 REGEX_ASSERT(fields[0]=="Now");
2979 REGEX_ASSERT(fields[1]=="is the time");
2980 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2981
2982 fields[1] = "*";
2983 status = U_ZERO_ERROR;
2984 n = pat1->split("Now is the time", fields, 1, status);
2985 REGEX_CHECK_STATUS;
2986 REGEX_ASSERT(n==1);
2987 REGEX_ASSERT(fields[0]=="Now is the time");
2988 REGEX_ASSERT(fields[1]=="*");
2989 status = U_ZERO_ERROR;
2990
2991 n = pat1->split(" Now is the time ", fields, 10, status);
2992 REGEX_CHECK_STATUS;
2993 REGEX_ASSERT(n==6);
2994 REGEX_ASSERT(fields[0]=="");
2995 REGEX_ASSERT(fields[1]=="Now");
2996 REGEX_ASSERT(fields[2]=="is");
2997 REGEX_ASSERT(fields[3]=="the");
2998 REGEX_ASSERT(fields[4]=="time");
2999 REGEX_ASSERT(fields[5]=="");
3000 REGEX_ASSERT(fields[6]=="");
3001
3002 fields[2] = "*";
3003 n = pat1->split(" ", fields, 10, status);
3004 REGEX_CHECK_STATUS;
3005 REGEX_ASSERT(n==2);
3006 REGEX_ASSERT(fields[0]=="");
3007 REGEX_ASSERT(fields[1]=="");
3008 REGEX_ASSERT(fields[2]=="*");
3009
3010 fields[0] = "foo";
3011 n = pat1->split("", fields, 10, status);
3012 REGEX_CHECK_STATUS;
3013 REGEX_ASSERT(n==0);
3014 REGEX_ASSERT(fields[0]=="foo");
3015
3016 delete pat1;
3017
3018 // split, with a pattern with (capture)
3019 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3020 pat1 = RegexPattern::compile(&re1, pe, status);
3021 REGEX_CHECK_STATUS;
3022
3023 status = U_ZERO_ERROR;
3024 fields[6] = fields[7] = "*";
3025 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3026 REGEX_CHECK_STATUS;
3027 REGEX_ASSERT(n==7);
3028 REGEX_ASSERT(fields[0]=="");
3029 REGEX_ASSERT(fields[1]=="a");
3030 REGEX_ASSERT(fields[2]=="Now is ");
3031 REGEX_ASSERT(fields[3]=="b");
3032 REGEX_ASSERT(fields[4]=="the time");
3033 REGEX_ASSERT(fields[5]=="c");
3034 REGEX_ASSERT(fields[6]=="");
3035 REGEX_ASSERT(fields[7]=="*");
3036 REGEX_ASSERT(status==U_ZERO_ERROR);
3037
3038 fields[6] = fields[7] = "*";
3039 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3040 REGEX_CHECK_STATUS;
3041 REGEX_ASSERT(n==7);
3042 REGEX_ASSERT(fields[0]==" ");
3043 REGEX_ASSERT(fields[1]=="a");
3044 REGEX_ASSERT(fields[2]=="Now is ");
3045 REGEX_ASSERT(fields[3]=="b");
3046 REGEX_ASSERT(fields[4]=="the time");
3047 REGEX_ASSERT(fields[5]=="c");
3048 REGEX_ASSERT(fields[6]=="");
3049 REGEX_ASSERT(fields[7]=="*");
3050
3051 status = U_ZERO_ERROR;
3052 fields[6] = "foo";
3053 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3054 REGEX_CHECK_STATUS;
3055 REGEX_ASSERT(n==6);
3056 REGEX_ASSERT(fields[0]==" ");
3057 REGEX_ASSERT(fields[1]=="a");
3058 REGEX_ASSERT(fields[2]=="Now is ");
3059 REGEX_ASSERT(fields[3]=="b");
3060 REGEX_ASSERT(fields[4]=="the time");
3061 REGEX_ASSERT(fields[5]==" ");
3062 REGEX_ASSERT(fields[6]=="foo");
3063
3064 status = U_ZERO_ERROR;
3065 fields[5] = "foo";
3066 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3067 REGEX_CHECK_STATUS;
3068 REGEX_ASSERT(n==5);
3069 REGEX_ASSERT(fields[0]==" ");
3070 REGEX_ASSERT(fields[1]=="a");
3071 REGEX_ASSERT(fields[2]=="Now is ");
3072 REGEX_ASSERT(fields[3]=="b");
3073 REGEX_ASSERT(fields[4]=="the time<c>");
3074 REGEX_ASSERT(fields[5]=="foo");
3075
3076 status = U_ZERO_ERROR;
3077 fields[5] = "foo";
3078 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3079 REGEX_CHECK_STATUS;
3080 REGEX_ASSERT(n==5);
3081 REGEX_ASSERT(fields[0]==" ");
3082 REGEX_ASSERT(fields[1]=="a");
3083 REGEX_ASSERT(fields[2]=="Now is ");
3084 REGEX_ASSERT(fields[3]=="b");
3085 REGEX_ASSERT(fields[4]=="the time");
3086 REGEX_ASSERT(fields[5]=="foo");
3087
3088 status = U_ZERO_ERROR;
3089 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3090 REGEX_CHECK_STATUS;
3091 REGEX_ASSERT(n==4);
3092 REGEX_ASSERT(fields[0]==" ");
3093 REGEX_ASSERT(fields[1]=="a");
3094 REGEX_ASSERT(fields[2]=="Now is ");
3095 REGEX_ASSERT(fields[3]=="the time<c>");
3096 status = U_ZERO_ERROR;
3097 delete pat1;
3098
3099 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3100 pat1 = RegexPattern::compile(&re1, pe, status);
3101 REGEX_CHECK_STATUS;
3102 n = pat1->split("1-10,20", fields, 10, status);
3103 REGEX_CHECK_STATUS;
3104 REGEX_ASSERT(n==5);
3105 REGEX_ASSERT(fields[0]=="1");
3106 REGEX_ASSERT(fields[1]=="-");
3107 REGEX_ASSERT(fields[2]=="10");
3108 REGEX_ASSERT(fields[3]==",");
3109 REGEX_ASSERT(fields[4]=="20");
3110 delete pat1;
3111
3112
3113 //
3114 // split of a UText based string, with library allocating output UTexts.
3115 //
3116 {
3117 status = U_ZERO_ERROR;
3118 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3119 UnicodeString stringToSplit("first:second:third");
3120 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3121 REGEX_CHECK_STATUS;
3122
3123 UText *splits[10] = {NULL};
3124 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3125 REGEX_CHECK_STATUS;
3126 REGEX_ASSERT(numFields == 5);
3127 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3128 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3129 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3130 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3131 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3132 REGEX_ASSERT(splits[5] == NULL);
3133
3134 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3135 if (splits[i]) {
3136 utext_close(splits[i]);
3137 splits[i] = NULL;
3138 }
3139 }
3140 utext_close(textToSplit);
3141 }
3142
3143
3144 //
3145 // RegexPattern::pattern() and patternText()
3146 //
3147 pat1 = new RegexPattern();
3148 REGEX_ASSERT(pat1->pattern() == "");
3149 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3150 delete pat1;
3151 const char *helloWorldInvariant = "(Hello, world)*";
3152 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3153 pat1 = RegexPattern::compile(&re1, pe, status);
3154 REGEX_CHECK_STATUS;
3155 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3156 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3157 delete pat1;
3158
3159 utext_close(&re1);
3160}
3161
3162
3163//---------------------------------------------------------------------------
3164//
3165// Extended A more thorough check for features of regex patterns
3166// The test cases are in a separate data file,
3167// source/tests/testdata/regextst.txt
3168// A description of the test data format is included in that file.
3169//
3170//---------------------------------------------------------------------------
3171
3172const char *
3173RegexTest::getPath(char buffer[2048], const char *filename) {
3174 UErrorCode status=U_ZERO_ERROR;
3175 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3176 if (U_FAILURE(status)) {
3177 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3178 return NULL;
3179 }
3180
3181 strcpy(buffer, testDataDirectory);
3182 strcat(buffer, filename);
3183 return buffer;
3184}
3185
3186void RegexTest::Extended() {
3187 char tdd[2048];
3188 const char *srcPath;
3189 UErrorCode status = U_ZERO_ERROR;
3190 int32_t lineNum = 0;
3191
3192 //
3193 // Open and read the test data file.
3194 //
3195 srcPath=getPath(tdd, "regextst.txt");
3196 if(srcPath==NULL) {
3197 return; /* something went wrong, error already output */
3198 }
3199
3200 int32_t len;
3201 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3202 if (U_FAILURE(status)) {
3203 return; /* something went wrong, error already output */
3204 }
3205
3206 //
3207 // Put the test data into a UnicodeString
3208 //
3209 UnicodeString testString(FALSE, testData, len);
3210
3211 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3212 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3213 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3214
3215 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3216 UnicodeString testPattern; // The pattern for test from the test file.
3217 UnicodeString testFlags; // the flags for a test.
3218 UnicodeString matchString; // The marked up string to be used as input
3219
3220 if (U_FAILURE(status)){
3221 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3222 delete [] testData;
3223 return;
3224 }
3225
3226 //
3227 // Loop over the test data file, once per line.
3228 //
3229 while (lineMat.find()) {
3230 lineNum++;
3231 if (U_FAILURE(status)) {
3232 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3233 }
3234
3235 status = U_ZERO_ERROR;
3236 UnicodeString testLine = lineMat.group(1, status);
3237 if (testLine.length() == 0) {
3238 continue;
3239 }
3240
3241 //
3242 // Parse the test line. Skip blank and comment only lines.
3243 // Separate out the three main fields - pattern, flags, target.
3244 //
3245
3246 commentMat.reset(testLine);
3247 if (commentMat.lookingAt(status)) {
3248 // This line is a comment, or blank.
3249 continue;
3250 }
3251
3252 //
3253 // Pull out the pattern field, remove it from the test file line.
3254 //
3255 quotedStuffMat.reset(testLine);
3256 if (quotedStuffMat.lookingAt(status)) {
3257 testPattern = quotedStuffMat.group(2, status);
3258 testLine.remove(0, quotedStuffMat.end(0, status));
3259 } else {
3260 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3261 continue;
3262 }
3263
3264
3265 //
3266 // Pull out the flags from the test file line.
3267 //
3268 flagsMat.reset(testLine);
3269 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3270 testFlags = flagsMat.group(1, status);
3271 if (flagsMat.group(2, status).length() > 0) {
3272 errln("Bad Match flag at line %d. Scanning %c\n",
3273 lineNum, flagsMat.group(2, status).charAt(0));
3274 continue;
3275 }
3276 testLine.remove(0, flagsMat.end(0, status));
3277
3278 //
3279 // Pull out the match string, as a whole.
3280 // We'll process the <tags> later.
3281 //
3282 quotedStuffMat.reset(testLine);
3283 if (quotedStuffMat.lookingAt(status)) {
3284 matchString = quotedStuffMat.group(2, status);
3285 testLine.remove(0, quotedStuffMat.end(0, status));
3286 } else {
3287 errln("Bad match string at test file line %d", lineNum);
3288 continue;
3289 }
3290
3291 //
3292 // The only thing left from the input line should be an optional trailing comment.
3293 //
3294 commentMat.reset(testLine);
3295 if (commentMat.lookingAt(status) == FALSE) {
3296 errln("Line %d: unexpected characters at end of test line.", lineNum);
3297 continue;
3298 }
3299
3300 //
3301 // Run the test
3302 //
3303 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3304 }
3305
3306 delete [] testData;
3307
3308}
3309
3310
3311
3312//---------------------------------------------------------------------------
3313//
3314// regex_find(pattern, flags, inputString, lineNumber)
3315//
3316// Function to run a single test from the Extended (data driven) tests.
3317// See file test/testdata/regextst.txt for a description of the
3318// pattern and inputString fields, and the allowed flags.
3319// lineNumber is the source line in regextst.txt of the test.
3320//
3321//---------------------------------------------------------------------------
3322
3323
3324// Set a value into a UVector at position specified by a decimal number in
3325// a UnicodeString. This is a utility function needed by the actual test function,
3326// which follows.
3327static void set(UVector &vec, int32_t val, UnicodeString index) {
3328 UErrorCode status=U_ZERO_ERROR;
3329 int32_t idx = 0;
3330 for (int32_t i=0; i<index.length(); i++) {
3331 int32_t d=u_charDigitValue(index.charAt(i));
3332 if (d<0) {return;}
3333 idx = idx*10 + d;
3334 }
3335 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3336 vec.setElementAt(val, idx);
3337}
3338
3339static void setInt(UVector &vec, int32_t val, int32_t idx) {
3340 UErrorCode status=U_ZERO_ERROR;
3341 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3342 vec.setElementAt(val, idx);
3343}
3344
3345static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3346{
3347 UBool couldFind = TRUE;
3348 UTEXT_SETNATIVEINDEX(utext, 0);
3349 int32_t i = 0;
3350 while (i < unistrOffset) {
3351 UChar32 c = UTEXT_NEXT32(utext);
3352 if (c != U_SENTINEL) {
3353 i += U16_LENGTH(c);
3354 } else {
3355 couldFind = FALSE;
3356 break;
3357 }
3358 }
3359 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3360 return couldFind;
3361}
3362
3363
3364void RegexTest::regex_find(const UnicodeString &pattern,
3365 const UnicodeString &flags,
3366 const UnicodeString &inputString,
3367 const char *srcPath,
3368 int32_t line) {
3369 UnicodeString unEscapedInput;
3370 UnicodeString deTaggedInput;
3371
3372 int32_t patternUTF8Length, inputUTF8Length;
3373 char *patternChars = NULL, *inputChars = NULL;
3374 UText patternText = UTEXT_INITIALIZER;
3375 UText inputText = UTEXT_INITIALIZER;
3376 UConverter *UTF8Converter = NULL;
3377
3378 UErrorCode status = U_ZERO_ERROR;
3379 UParseError pe;
3380 RegexPattern *parsePat = NULL;
3381 RegexMatcher *parseMatcher = NULL;
3382 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3383 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3384 UVector groupStarts(status);
3385 UVector groupEnds(status);
3386 UVector groupStartsUTF8(status);
3387 UVector groupEndsUTF8(status);
3388 UBool isMatch = FALSE, isUTF8Match = FALSE;
3389 UBool failed = FALSE;
3390 int32_t numFinds;
3391 int32_t i;
3392 UBool useMatchesFunc = FALSE;
3393 UBool useLookingAtFunc = FALSE;
3394 int32_t regionStart = -1;
3395 int32_t regionEnd = -1;
3396 int32_t regionStartUTF8 = -1;
3397 int32_t regionEndUTF8 = -1;
3398
3399
3400 //
3401 // Compile the caller's pattern
3402 //
3403 uint32_t bflags = 0;
3404 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3405 bflags |= UREGEX_CASE_INSENSITIVE;
3406 }
3407 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3408 bflags |= UREGEX_COMMENTS;
3409 }
3410 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3411 bflags |= UREGEX_DOTALL;
3412 }
3413 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3414 bflags |= UREGEX_MULTILINE;
3415 }
3416
3417 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3418 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3419 }
3420 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3421 bflags |= UREGEX_UNIX_LINES;
3422 }
3423 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3424 bflags |= UREGEX_LITERAL;
3425 }
3426
3427
3428 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3429 if (status != U_ZERO_ERROR) {
3430 #if UCONFIG_NO_BREAK_ITERATION==1
3431 // 'v' test flag means that the test pattern should not compile if ICU was configured
3432 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3433 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3434 goto cleanupAndReturn;
3435 }
3436 #endif
3437 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3438 // Expected pattern compilation error.
3439 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3440 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3441 }
3442 goto cleanupAndReturn;
3443 } else {
3444 // Unexpected pattern compilation error.
3445 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3446 goto cleanupAndReturn;
3447 }
3448 }
3449
3450 UTF8Converter = ucnv_open("UTF8", &status);
3451 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3452
3453 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3454 status = U_ZERO_ERROR; // buffer overflow
3455 patternChars = new char[patternUTF8Length+1];
3456 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3457 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3458
3459 if (status == U_ZERO_ERROR) {
3460 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3461
3462 if (status != U_ZERO_ERROR) {
3463#if UCONFIG_NO_BREAK_ITERATION==1
3464 // 'v' test flag means that the test pattern should not compile if ICU was configured
3465 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3466 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3467 goto cleanupAndReturn;
3468 }
3469#endif
3470 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3471 // Expected pattern compilation error.
3472 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3473 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3474 }
3475 goto cleanupAndReturn;
3476 } else {
3477 // Unexpected pattern compilation error.
3478 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3479 goto cleanupAndReturn;
3480 }
3481 }
3482 }
3483
3484 if (UTF8Pattern == NULL) {
3485 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3486 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3487 status = U_ZERO_ERROR;
3488 }
3489
3490 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3491 callerPattern->dumpPattern();
3492 }
3493
3494 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3495 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3496 goto cleanupAndReturn;
3497 }
3498
3499
3500 //
3501 // Number of times find() should be called on the test string, default to 1
3502 //
3503 numFinds = 1;
3504 for (i=2; i<=9; i++) {
3505 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3506 if (numFinds != 1) {
3507 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3508 goto cleanupAndReturn;
3509 }
3510 numFinds = i;
3511 }
3512 }
3513
3514 // 'M' flag. Use matches() instead of find()
3515 if (flags.indexOf((UChar)0x4d) >= 0) {
3516 useMatchesFunc = TRUE;
3517 }
3518 if (flags.indexOf((UChar)0x4c) >= 0) {
3519 useLookingAtFunc = TRUE;
3520 }
3521
3522 //
3523 // Find the tags in the input data, remove them, and record the group boundary
3524 // positions.
3525 //
3526 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3527 REGEX_CHECK_STATUS_L(line);
3528
3529 unEscapedInput = inputString.unescape();
3530 parseMatcher = parsePat->matcher(unEscapedInput, status);
3531 REGEX_CHECK_STATUS_L(line);
3532 while(parseMatcher->find()) {
3533 parseMatcher->appendReplacement(deTaggedInput, "", status);
3534 REGEX_CHECK_STATUS;
3535 UnicodeString groupNum = parseMatcher->group(2, status);
3536 if (groupNum == "r") {
3537 // <r> or </r>, a region specification within the string
3538 if (parseMatcher->group(1, status) == "/") {
3539 regionEnd = deTaggedInput.length();
3540 } else {
3541 regionStart = deTaggedInput.length();
3542 }
3543 } else {
3544 // <digits> or </digits>, a group match boundary tag.
3545 if (parseMatcher->group(1, status) == "/") {
3546 set(groupEnds, deTaggedInput.length(), groupNum);
3547 } else {
3548 set(groupStarts, deTaggedInput.length(), groupNum);
3549 }
3550 }
3551 }
3552 parseMatcher->appendTail(deTaggedInput);
3553 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3554 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3555 errln("mismatched <r> tags");
3556 failed = TRUE;
3557 goto cleanupAndReturn;
3558 }
3559
3560 //
3561 // Configure the matcher according to the flags specified with this test.
3562 //
3563 matcher = callerPattern->matcher(deTaggedInput, status);
3564 REGEX_CHECK_STATUS_L(line);
3565 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3566 matcher->setTrace(TRUE);
3567 }
3568
3569 if (UTF8Pattern != NULL) {
3570 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3571 status = U_ZERO_ERROR; // buffer overflow
3572 inputChars = new char[inputUTF8Length+1];
3573 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3574 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3575
3576 if (status == U_ZERO_ERROR) {
3577 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3578 REGEX_CHECK_STATUS_L(line);
3579 }
3580
3581 if (UTF8Matcher == NULL) {
3582 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3583 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3584 status = U_ZERO_ERROR;
3585 }
3586 }
3587
3588 //
3589 // Generate native indices for UTF8 versions of region and capture group info
3590 //
3591 if (UTF8Matcher != NULL) {
3592 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3593 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3594
3595 // Fill out the native index UVector info.
3596 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3597 for (i=0; i<groupStarts.size(); i++) {
3598 int32_t start = groupStarts.elementAti(i);
3599 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3600 if (start >= 0) {
3601 int32_t startUTF8;
3602 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3603 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3604 failed = TRUE;
3605 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3606 }
3607 setInt(groupStartsUTF8, startUTF8, i);
3608 }
3609
3610 int32_t end = groupEnds.elementAti(i);
3611 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3612 if (end >= 0) {
3613 int32_t endUTF8;
3614 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3615 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3616 failed = TRUE;
3617 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3618 }
3619 setInt(groupEndsUTF8, endUTF8, i);
3620 }
3621 }
3622 }
3623
3624 if (regionStart>=0) {
3625 matcher->region(regionStart, regionEnd, status);
3626 REGEX_CHECK_STATUS_L(line);
3627 if (UTF8Matcher != NULL) {
3628 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3629 REGEX_CHECK_STATUS_L(line);
3630 }
3631 }
3632 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3633 matcher->useAnchoringBounds(FALSE);
3634 if (UTF8Matcher != NULL) {
3635 UTF8Matcher->useAnchoringBounds(FALSE);
3636 }
3637 }
3638 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3639 matcher->useTransparentBounds(TRUE);
3640 if (UTF8Matcher != NULL) {
3641 UTF8Matcher->useTransparentBounds(TRUE);
3642 }
3643 }
3644
3645
3646
3647 //
3648 // Do a find on the de-tagged input using the caller's pattern
3649 // TODO: error on count>1 and not find().
3650 // error on both matches() and lookingAt().
3651 //
3652 for (i=0; i<numFinds; i++) {
3653 if (useMatchesFunc) {
3654 isMatch = matcher->matches(status);
3655 if (UTF8Matcher != NULL) {
3656 isUTF8Match = UTF8Matcher->matches(status);
3657 }
3658 } else if (useLookingAtFunc) {
3659 isMatch = matcher->lookingAt(status);
3660 if (UTF8Matcher != NULL) {
3661 isUTF8Match = UTF8Matcher->lookingAt(status);
3662 }
3663 } else {
3664 isMatch = matcher->find();
3665 if (UTF8Matcher != NULL) {
3666 isUTF8Match = UTF8Matcher->find();
3667 }
3668 }
3669 }
3670 matcher->setTrace(FALSE);
3671 if (U_FAILURE(status)) {
3672 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3673 }
3674
3675 //
3676 // Match up the groups from the find() with the groups from the tags
3677 //
3678
3679 // number of tags should match number of groups from find operation.
3680 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3681 // G option in test means that capture group data is not available in the
3682 // expected results, so the check needs to be suppressed.
3683 if (isMatch == FALSE && groupStarts.size() != 0) {
3684 dataerrln("Error at line %d: Match expected, but none found.", line);
3685 failed = TRUE;
3686 goto cleanupAndReturn;
3687 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3688 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3689 failed = TRUE;
3690 goto cleanupAndReturn;
3691 }
3692
3693 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694 // Only check for match / no match. Don't check capture groups.
3695 if (isMatch && groupStarts.size() == 0) {
3696 errln("Error at line %d: No match expected, but one found.", line);
3697 failed = TRUE;
3698 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3699 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3700 failed = TRUE;
3701 }
3702 goto cleanupAndReturn;
3703 }
3704
3705 REGEX_CHECK_STATUS_L(line);
3706 for (i=0; i<=matcher->groupCount(); i++) {
3707 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3708 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3709 if (matcher->start(i, status) != expectedStart) {
3710 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3711 line, i, expectedStart, matcher->start(i, status));
3712 failed = TRUE;
3713 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3714 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3715 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3716 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3717 failed = TRUE;
3718 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3719 }
3720
3721 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3722 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3723 if (matcher->end(i, status) != expectedEnd) {
3724 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3725 line, i, expectedEnd, matcher->end(i, status));
3726 failed = TRUE;
3727 // Error on end position; keep going; real error is probably yet to come as group
3728 // end positions work from end of the input data towards the front.
3729 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3730 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3731 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3732 failed = TRUE;
3733 // Error on end position; keep going; real error is probably yet to come as group
3734 // end positions work from end of the input data towards the front.
3735 }
3736 }
3737 if ( matcher->groupCount()+1 < groupStarts.size()) {
3738 errln("Error at line %d: Expected %d capture groups, found %d.",
3739 line, groupStarts.size()-1, matcher->groupCount());
3740 failed = TRUE;
3741 }
3742 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3743 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3744 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3745 failed = TRUE;
3746 }
3747
3748 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3749 matcher->requireEnd() == TRUE) {
3750 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3751 failed = TRUE;
3752 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3753 UTF8Matcher->requireEnd() == TRUE) {
3754 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3755 failed = TRUE;
3756 }
3757
3758 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3759 matcher->requireEnd() == FALSE) {
3760 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3761 failed = TRUE;
3762 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3763 UTF8Matcher->requireEnd() == FALSE) {
3764 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3765 failed = TRUE;
3766 }
3767
3768 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3769 matcher->hitEnd() == TRUE) {
3770 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3771 failed = TRUE;
3772 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3773 UTF8Matcher->hitEnd() == TRUE) {
3774 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3775 failed = TRUE;
3776 }
3777
3778 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3779 matcher->hitEnd() == FALSE) {
3780 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3781 failed = TRUE;
3782 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3783 UTF8Matcher->hitEnd() == FALSE) {
3784 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3785 failed = TRUE;
3786 }
3787
3788
3789cleanupAndReturn:
3790 if (failed) {
3791 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3792 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3793 // callerPattern->dump();
3794 }
3795 delete parseMatcher;
3796 delete parsePat;
3797 delete UTF8Matcher;
3798 delete UTF8Pattern;
3799 delete matcher;
3800 delete callerPattern;
3801
3802 utext_close(&inputText);
3803 delete[] inputChars;
3804 utext_close(&patternText);
3805 delete[] patternChars;
3806 ucnv_close(UTF8Converter);
3807}
3808
3809
3810
3811
3812//---------------------------------------------------------------------------
3813//
3814// Errors Check for error handling in patterns.
3815//
3816//---------------------------------------------------------------------------
3817void RegexTest::Errors() {
3818 // \escape sequences that aren't implemented yet.
3819 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3820
3821 // Missing close parentheses
3822 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3823 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3824 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3825
3826 // Extra close paren
3827 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3828 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3829 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3830
3831 // Look-ahead, Look-behind
3832 // TODO: add tests for unbounded length look-behinds.
3833 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3834
3835 // Attempt to use non-default flags
3836 {
3837 UParseError pe;
3838 UErrorCode status = U_ZERO_ERROR;
3839 int32_t flags = UREGEX_CANON_EQ |
3840 UREGEX_COMMENTS | UREGEX_DOTALL |
3841 UREGEX_MULTILINE;
3842 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3843 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3844 delete pat1;
3845 }
3846
3847
3848 // Quantifiers are allowed only after something that can be quantified.
3849 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3850 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3851 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3852
3853 // Mal-formed {min,max} quantifiers
3854 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3855 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3856 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3857 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3858 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3859 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3860 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3861 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3862 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3863
3864 // Ticket 5389
3865 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3866
3867 // Invalid Back Reference \0
3868 // For ICU 3.8 and earlier
3869 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3870 //
3871 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3872
3873}
3874
3875
3876//-------------------------------------------------------------------------------
3877//
3878// Read a text data file, convert it to UChars, and return the data
3879// in one big UChar * buffer, which the caller must delete.
3880//
3881//--------------------------------------------------------------------------------
3882UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3883 const char *defEncoding, UErrorCode &status) {
3884 UChar *retPtr = NULL;
3885 char *fileBuf = NULL;
3886 UConverter* conv = NULL;
3887 FILE *f = NULL;
3888
3889 ulen = 0;
3890 if (U_FAILURE(status)) {
3891 return retPtr;
3892 }
3893
3894 //
3895 // Open the file.
3896 //
3897 f = fopen(fileName, "rb");
3898 if (f == 0) {
3899 dataerrln("Error opening test data file %s\n", fileName);
3900 status = U_FILE_ACCESS_ERROR;
3901 return NULL;
3902 }
3903 //
3904 // Read it in
3905 //
3906 int32_t fileSize;
3907 int32_t amt_read;
3908
3909 fseek( f, 0, SEEK_END);
3910 fileSize = ftell(f);
3911 fileBuf = new char[fileSize];
3912 fseek(f, 0, SEEK_SET);
3913 amt_read = fread(fileBuf, 1, fileSize, f);
3914 if (amt_read != fileSize || fileSize <= 0) {
3915 errln("Error reading test data file.");
3916 goto cleanUpAndReturn;
3917 }
3918
3919 //
3920 // Look for a Unicode Signature (BOM) on the data just read
3921 //
3922 int32_t signatureLength;
3923 const char * fileBufC;
3924 const char* encoding;
3925
3926 fileBufC = fileBuf;
3927 encoding = ucnv_detectUnicodeSignature(
3928 fileBuf, fileSize, &signatureLength, &status);
3929 if(encoding!=NULL ){
3930 fileBufC += signatureLength;
3931 fileSize -= signatureLength;
3932 } else {
3933 encoding = defEncoding;
3934 if (strcmp(encoding, "utf-8") == 0) {
3935 errln("file %s is missing its BOM", fileName);
3936 }
3937 }
3938
3939 //
3940 // Open a converter to take the rule file to UTF-16
3941 //
3942 conv = ucnv_open(encoding, &status);
3943 if (U_FAILURE(status)) {
3944 goto cleanUpAndReturn;
3945 }
3946
3947 //
3948 // Convert the rules to UChar.
3949 // Preflight first to determine required buffer size.
3950 //
3951 ulen = ucnv_toUChars(conv,
3952 NULL, // dest,
3953 0, // destCapacity,
3954 fileBufC,
3955 fileSize,
3956 &status);
3957 if (status == U_BUFFER_OVERFLOW_ERROR) {
3958 // Buffer Overflow is expected from the preflight operation.
3959 status = U_ZERO_ERROR;
3960
3961 retPtr = new UChar[ulen+1];
3962 ucnv_toUChars(conv,
3963 retPtr, // dest,
3964 ulen+1,
3965 fileBufC,
3966 fileSize,
3967 &status);
3968 }
3969
3970cleanUpAndReturn:
3971 fclose(f);
3972 delete[] fileBuf;
3973 ucnv_close(conv);
3974 if (U_FAILURE(status)) {
3975 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3976 delete []retPtr;
3977 retPtr = 0;
3978 ulen = 0;
3979 };
3980 return retPtr;
3981}
3982
3983
3984//-------------------------------------------------------------------------------
3985//
3986// PerlTests - Run Perl's regular expression tests
3987// The input file for this test is re_tests, the standard regular
3988// expression test data distributed with the Perl source code.
3989//
3990// Here is Perl's description of the test data file:
3991//
3992// # The tests are in a separate file 't/op/re_tests'.
3993// # Each line in that file is a separate test.
3994// # There are five columns, separated by tabs.
3995// #
3996// # Column 1 contains the pattern, optionally enclosed in C<''>.
3997// # Modifiers can be put after the closing C<'>.
3998// #
3999// # Column 2 contains the string to be matched.
4000// #
4001// # Column 3 contains the expected result:
4002// # y expect a match
4003// # n expect no match
4004// # c expect an error
4005// # B test exposes a known bug in Perl, should be skipped
4006// # b test exposes a known bug in Perl, should be skipped if noamp
4007// #
4008// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4009// #
4010// # Column 4 contains a string, usually C<$&>.
4011// #
4012// # Column 5 contains the expected result of double-quote
4013// # interpolating that string after the match, or start of error message.
4014// #
4015// # Column 6, if present, contains a reason why the test is skipped.
4016// # This is printed with "skipped", for harness to pick up.
4017// #
4018// # \n in the tests are interpolated, as are variables of the form ${\w+}.
4019// #
4020// # If you want to add a regular expression test that can't be expressed
4021// # in this format, don't add it here: put it in op/pat.t instead.
4022//
4023// For ICU, if field 3 contains an 'i', the test will be skipped.
4024// The test exposes some known incompatibility between ICU and Perl regexps.
4025// (The i is in addition to whatever was there before.)
4026//
4027//-------------------------------------------------------------------------------
4028void RegexTest::PerlTests() {
4029 char tdd[2048];
4030 const char *srcPath;
4031 UErrorCode status = U_ZERO_ERROR;
4032 UParseError pe;
4033
4034 //
4035 // Open and read the test data file.
4036 //
4037 srcPath=getPath(tdd, "re_tests.txt");
4038 if(srcPath==NULL) {
4039 return; /* something went wrong, error already output */
4040 }
4041
4042 int32_t len;
4043 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4044 if (U_FAILURE(status)) {
4045 return; /* something went wrong, error already output */
4046 }
4047
4048 //
4049 // Put the test data into a UnicodeString
4050 //
4051 UnicodeString testDataString(FALSE, testData, len);
4052
4053 //
4054 // Regex to break the input file into lines, and strip the new lines.
4055 // One line per match, capture group one is the desired data.
4056 //
4057 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4058 if (U_FAILURE(status)) {
4059 dataerrln("RegexPattern::compile() error");
4060 return;
4061 }
4062 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4063
4064 //
4065 // Regex to split a test file line into fields.
4066 // There are six fields, separated by tabs.
4067 //
4068 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4069
4070 //
4071 // Regex to identify test patterns with flag settings, and to separate them.
4072 // Test patterns with flags look like 'pattern'i
4073 // Test patterns without flags are not quoted: pattern
4074 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4075 //
4076 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4077 RegexMatcher* flagMat = flagPat->matcher(status);
4078
4079 //
4080 // The Perl tests reference several perl-isms, which are evaluated/substituted
4081 // in the test data. Not being perl, this must be done explicitly. Here
4082 // are string constants and REs for these constructs.
4083 //
4084 UnicodeString nulnulSrc("${nulnul}");
4085 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4086 nulnul = nulnul.unescape();
4087
4088 UnicodeString ffffSrc("${ffff}");
4089 UnicodeString ffff("\\uffff", -1, US_INV);
4090 ffff = ffff.unescape();
4091
4092 // regexp for $-[0], $+[2], etc.
4093 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4094 RegexMatcher *groupsMat = groupsPat->matcher(status);
4095
4096 // regexp for $0, $1, $2, etc.
4097 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4098 RegexMatcher *cgMat = cgPat->matcher(status);
4099
4100
4101 //
4102 // Main Loop for the Perl Tests, runs once per line from the
4103 // test data file.
4104 //
4105 int32_t lineNum = 0;
4106 int32_t skippedUnimplementedCount = 0;
4107 while (lineMat->find()) {
4108 lineNum++;
4109
4110 //
4111 // Get a line, break it into its fields, do the Perl
4112 // variable substitutions.
4113 //
4114 UnicodeString line = lineMat->group(1, status);
4115 UnicodeString fields[7];
4116 fieldPat->split(line, fields, 7, status);
4117
4118 flagMat->reset(fields[0]);
4119 flagMat->matches(status);
4120 UnicodeString pattern = flagMat->group(2, status);
4121 pattern.findAndReplace("${bang}", "!");
4122 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4123 pattern.findAndReplace(ffffSrc, ffff);
4124
4125 //
4126 // Identify patterns that include match flag settings,
4127 // split off the flags, remove the extra quotes.
4128 //
4129 UnicodeString flagStr = flagMat->group(3, status);
4130 if (U_FAILURE(status)) {
4131 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4132 return;
4133 }
4134 int32_t flags = 0;
4135 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4136 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4137 const UChar UChar_m = 0x6d;
4138 const UChar UChar_x = 0x78;
4139 const UChar UChar_y = 0x79;
4140 if (flagStr.indexOf(UChar_i) != -1) {
4141 flags |= UREGEX_CASE_INSENSITIVE;
4142 }
4143 if (flagStr.indexOf(UChar_m) != -1) {
4144 flags |= UREGEX_MULTILINE;
4145 }
4146 if (flagStr.indexOf(UChar_x) != -1) {
4147 flags |= UREGEX_COMMENTS;
4148 }
4149
4150 //
4151 // Compile the test pattern.
4152 //
4153 status = U_ZERO_ERROR;
4154 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4155 if (status == U_REGEX_UNIMPLEMENTED) {
4156 //
4157 // Test of a feature that is planned for ICU, but not yet implemented.
4158 // skip the test.
4159 skippedUnimplementedCount++;
4160 delete testPat;
4161 status = U_ZERO_ERROR;
4162 continue;
4163 }
4164
4165 if (U_FAILURE(status)) {
4166 // Some tests are supposed to generate errors.
4167 // Only report an error for tests that are supposed to succeed.
4168 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4169 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4170 {
4171 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4172 }
4173 status = U_ZERO_ERROR;
4174 delete testPat;
4175 continue;
4176 }
4177
4178 if (fields[2].indexOf(UChar_i) >= 0) {
4179 // ICU should skip this test.
4180 delete testPat;
4181 continue;
4182 }
4183
4184 if (fields[2].indexOf(UChar_c) >= 0) {
4185 // This pattern should have caused a compilation error, but didn't/
4186 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4187 delete testPat;
4188 continue;
4189 }
4190
4191 //
4192 // replace the Perl variables that appear in some of the
4193 // match data strings.
4194 //
4195 UnicodeString matchString = fields[1];
4196 matchString.findAndReplace(nulnulSrc, nulnul);
4197 matchString.findAndReplace(ffffSrc, ffff);
4198
4199 // Replace any \n in the match string with an actual new-line char.
4200 // Don't do full unescape, as this unescapes more than Perl does, which
4201 // causes other spurious failures in the tests.
4202 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4203
4204
4205
4206 //
4207 // Run the test, check for expected match/don't match result.
4208 //
4209 RegexMatcher *testMat = testPat->matcher(matchString, status);
4210 UBool found = testMat->find();
4211 UBool expected = FALSE;
4212 if (fields[2].indexOf(UChar_y) >=0) {
4213 expected = TRUE;
4214 }
4215 if (expected != found) {
4216 errln("line %d: Expected %smatch, got %smatch",
4217 lineNum, expected?"":"no ", found?"":"no " );
4218 continue;
4219 }
4220
4221 // Don't try to check expected results if there is no match.
4222 // (Some have stuff in the expected fields)
4223 if (!found) {
4224 delete testMat;
4225 delete testPat;
4226 continue;
4227 }
4228
4229 //
4230 // Interpret the Perl expression from the fourth field of the data file,
4231 // building up an ICU string from the results of the ICU match.
4232 // The Perl expression will contain references to the results of
4233 // a regex match, including the matched string, capture group strings,
4234 // group starting and ending indicies, etc.
4235 //
4236 UnicodeString resultString;
4237 UnicodeString perlExpr = fields[3];
4238#if SUPPORT_MUTATING_INPUT_STRING
4239 groupsMat->reset(perlExpr);
4240 cgMat->reset(perlExpr);
4241#endif
4242
4243 while (perlExpr.length() > 0) {
4244#if !SUPPORT_MUTATING_INPUT_STRING
4245 // Perferred usage. Reset after any modification to input string.
4246 groupsMat->reset(perlExpr);
4247 cgMat->reset(perlExpr);
4248#endif
4249
4250 if (perlExpr.startsWith("$&")) {
4251 resultString.append(testMat->group(status));
4252 perlExpr.remove(0, 2);
4253 }
4254
4255 else if (groupsMat->lookingAt(status)) {
4256 // $-[0] $+[2] etc.
4257 UnicodeString digitString = groupsMat->group(2, status);
4258 int32_t t = 0;
4259 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4260 UnicodeString plusOrMinus = groupsMat->group(1, status);
4261 int32_t matchPosition;
4262 if (plusOrMinus.compare("+") == 0) {
4263 matchPosition = testMat->end(groupNum, status);
4264 } else {
4265 matchPosition = testMat->start(groupNum, status);
4266 }
4267 if (matchPosition != -1) {
4268 ICU_Utility::appendNumber(resultString, matchPosition);
4269 }
4270 perlExpr.remove(0, groupsMat->end(status));
4271 }
4272
4273 else if (cgMat->lookingAt(status)) {
4274 // $1, $2, $3, etc.
4275 UnicodeString digitString = cgMat->group(1, status);
4276 int32_t t = 0;
4277 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4278 if (U_SUCCESS(status)) {
4279 resultString.append(testMat->group(groupNum, status));
4280 status = U_ZERO_ERROR;
4281 }
4282 perlExpr.remove(0, cgMat->end(status));
4283 }
4284
4285 else if (perlExpr.startsWith("@-")) {
4286 int32_t i;
4287 for (i=0; i<=testMat->groupCount(); i++) {
4288 if (i>0) {
4289 resultString.append(" ");
4290 }
4291 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4292 }
4293 perlExpr.remove(0, 2);
4294 }
4295
4296 else if (perlExpr.startsWith("@+")) {
4297 int32_t i;
4298 for (i=0; i<=testMat->groupCount(); i++) {
4299 if (i>0) {
4300 resultString.append(" ");
4301 }
4302 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4303 }
4304 perlExpr.remove(0, 2);
4305 }
4306
4307 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4308 // or as an escaped sequence (e.g. \n)
4309 if (perlExpr.length() > 1) {
4310 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4311 }
4312 UChar c = perlExpr.charAt(0);
4313 switch (c) {
4314 case 'n': c = '\n'; break;
4315 // add any other escape sequences that show up in the test expected results.
4316 }
4317 resultString.append(c);
4318 perlExpr.remove(0, 1);
4319 }
4320
4321 else {
4322 // Any characters from the perl expression that we don't explicitly
4323 // recognize before here are assumed to be literals and copied
4324 // as-is to the expected results.
4325 resultString.append(perlExpr.charAt(0));
4326 perlExpr.remove(0, 1);
4327 }
4328
4329 if (U_FAILURE(status)) {
4330 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4331 break;
4332 }
4333 }
4334
4335 //
4336 // Expected Results Compare
4337 //
4338 UnicodeString expectedS(fields[4]);
4339 expectedS.findAndReplace(nulnulSrc, nulnul);
4340 expectedS.findAndReplace(ffffSrc, ffff);
4341 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4342
4343
4344 if (expectedS.compare(resultString) != 0) {
4345 err("Line %d: Incorrect perl expression results.", lineNum);
4346 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4347 }
4348
4349 delete testMat;
4350 delete testPat;
4351 }
4352
4353 //
4354 // All done. Clean up allocated stuff.
4355 //
4356 delete cgMat;
4357 delete cgPat;
4358
4359 delete groupsMat;
4360 delete groupsPat;
4361
4362 delete flagMat;
4363 delete flagPat;
4364
4365 delete lineMat;
4366 delete linePat;
4367
4368 delete fieldPat;
4369 delete [] testData;
4370
4371
4372 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4373
4374}
4375
4376
4377//-------------------------------------------------------------------------------
4378//
4379// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4380// (instead of using UnicodeStrings) to test the alternate engine.
4381// The input file for this test is re_tests, the standard regular
4382// expression test data distributed with the Perl source code.
4383// See PerlTests() for more information.
4384//
4385//-------------------------------------------------------------------------------
4386void RegexTest::PerlTestsUTF8() {
4387 char tdd[2048];
4388 const char *srcPath;
4389 UErrorCode status = U_ZERO_ERROR;
4390 UParseError pe;
4391 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4392 UText patternText = UTEXT_INITIALIZER;
4393 char *patternChars = NULL;
4394 int32_t patternLength;
4395 int32_t patternCapacity = 0;
4396 UText inputText = UTEXT_INITIALIZER;
4397 char *inputChars = NULL;
4398 int32_t inputLength;
4399 int32_t inputCapacity = 0;
4400
4401 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4402
4403 //
4404 // Open and read the test data file.
4405 //
4406 srcPath=getPath(tdd, "re_tests.txt");
4407 if(srcPath==NULL) {
4408 return; /* something went wrong, error already output */
4409 }
4410
4411 int32_t len;
4412 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4413 if (U_FAILURE(status)) {
4414 return; /* something went wrong, error already output */
4415 }
4416
4417 //
4418 // Put the test data into a UnicodeString
4419 //
4420 UnicodeString testDataString(FALSE, testData, len);
4421
4422 //
4423 // Regex to break the input file into lines, and strip the new lines.
4424 // One line per match, capture group one is the desired data.
4425 //
4426 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4427 if (U_FAILURE(status)) {
4428 dataerrln("RegexPattern::compile() error");
4429 return;
4430 }
4431 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4432
4433 //
4434 // Regex to split a test file line into fields.
4435 // There are six fields, separated by tabs.
4436 //
4437 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4438
4439 //
4440 // Regex to identify test patterns with flag settings, and to separate them.
4441 // Test patterns with flags look like 'pattern'i
4442 // Test patterns without flags are not quoted: pattern
4443 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4444 //
4445 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4446 RegexMatcher* flagMat = flagPat->matcher(status);
4447
4448 //
4449 // The Perl tests reference several perl-isms, which are evaluated/substituted
4450 // in the test data. Not being perl, this must be done explicitly. Here
4451 // are string constants and REs for these constructs.
4452 //
4453 UnicodeString nulnulSrc("${nulnul}");
4454 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4455 nulnul = nulnul.unescape();
4456
4457 UnicodeString ffffSrc("${ffff}");
4458 UnicodeString ffff("\\uffff", -1, US_INV);
4459 ffff = ffff.unescape();
4460
4461 // regexp for $-[0], $+[2], etc.
4462 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4463 RegexMatcher *groupsMat = groupsPat->matcher(status);
4464
4465 // regexp for $0, $1, $2, etc.
4466 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4467 RegexMatcher *cgMat = cgPat->matcher(status);
4468
4469
4470 //
4471 // Main Loop for the Perl Tests, runs once per line from the
4472 // test data file.
4473 //
4474 int32_t lineNum = 0;
4475 int32_t skippedUnimplementedCount = 0;
4476 while (lineMat->find()) {
4477 lineNum++;
4478
4479 //
4480 // Get a line, break it into its fields, do the Perl
4481 // variable substitutions.
4482 //
4483 UnicodeString line = lineMat->group(1, status);
4484 UnicodeString fields[7];
4485 fieldPat->split(line, fields, 7, status);
4486
4487 flagMat->reset(fields[0]);
4488 flagMat->matches(status);
4489 UnicodeString pattern = flagMat->group(2, status);
4490 pattern.findAndReplace("${bang}", "!");
4491 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4492 pattern.findAndReplace(ffffSrc, ffff);
4493
4494 //
4495 // Identify patterns that include match flag settings,
4496 // split off the flags, remove the extra quotes.
4497 //
4498 UnicodeString flagStr = flagMat->group(3, status);
4499 if (U_FAILURE(status)) {
4500 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4501 return;
4502 }
4503 int32_t flags = 0;
4504 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4505 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4506 const UChar UChar_m = 0x6d;
4507 const UChar UChar_x = 0x78;
4508 const UChar UChar_y = 0x79;
4509 if (flagStr.indexOf(UChar_i) != -1) {
4510 flags |= UREGEX_CASE_INSENSITIVE;
4511 }
4512 if (flagStr.indexOf(UChar_m) != -1) {
4513 flags |= UREGEX_MULTILINE;
4514 }
4515 if (flagStr.indexOf(UChar_x) != -1) {
4516 flags |= UREGEX_COMMENTS;
4517 }
4518
4519 //
4520 // Put the pattern in a UTF-8 UText
4521 //
4522 status = U_ZERO_ERROR;
4523 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4524 if (status == U_BUFFER_OVERFLOW_ERROR) {
4525 status = U_ZERO_ERROR;
4526 delete[] patternChars;
4527 patternCapacity = patternLength + 1;
4528 patternChars = new char[patternCapacity];
4529 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4530 }
4531 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4532
4533 //
4534 // Compile the test pattern.
4535 //
4536 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4537 if (status == U_REGEX_UNIMPLEMENTED) {
4538 //
4539 // Test of a feature that is planned for ICU, but not yet implemented.
4540 // skip the test.
4541 skippedUnimplementedCount++;
4542 delete testPat;
4543 status = U_ZERO_ERROR;
4544 continue;
4545 }
4546
4547 if (U_FAILURE(status)) {
4548 // Some tests are supposed to generate errors.
4549 // Only report an error for tests that are supposed to succeed.
4550 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4551 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4552 {
4553 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4554 }
4555 status = U_ZERO_ERROR;
4556 delete testPat;
4557 continue;
4558 }
4559
4560 if (fields[2].indexOf(UChar_i) >= 0) {
4561 // ICU should skip this test.
4562 delete testPat;
4563 continue;
4564 }
4565
4566 if (fields[2].indexOf(UChar_c) >= 0) {
4567 // This pattern should have caused a compilation error, but didn't/
4568 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4569 delete testPat;
4570 continue;
4571 }
4572
4573
4574 //
4575 // replace the Perl variables that appear in some of the
4576 // match data strings.
4577 //
4578 UnicodeString matchString = fields[1];
4579 matchString.findAndReplace(nulnulSrc, nulnul);
4580 matchString.findAndReplace(ffffSrc, ffff);
4581
4582 // Replace any \n in the match string with an actual new-line char.
4583 // Don't do full unescape, as this unescapes more than Perl does, which
4584 // causes other spurious failures in the tests.
4585 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4586
4587 //
4588 // Put the input in a UTF-8 UText
4589 //
4590 status = U_ZERO_ERROR;
4591 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4592 if (status == U_BUFFER_OVERFLOW_ERROR) {
4593 status = U_ZERO_ERROR;
4594 delete[] inputChars;
4595 inputCapacity = inputLength + 1;
4596 inputChars = new char[inputCapacity];
4597 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4598 }
4599 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4600
4601 //
4602 // Run the test, check for expected match/don't match result.
4603 //
4604 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4605 UBool found = testMat->find();
4606 UBool expected = FALSE;
4607 if (fields[2].indexOf(UChar_y) >=0) {
4608 expected = TRUE;
4609 }
4610 if (expected != found) {
4611 errln("line %d: Expected %smatch, got %smatch",
4612 lineNum, expected?"":"no ", found?"":"no " );
4613 continue;
4614 }
4615
4616 // Don't try to check expected results if there is no match.
4617 // (Some have stuff in the expected fields)
4618 if (!found) {
4619 delete testMat;
4620 delete testPat;
4621 continue;
4622 }
4623
4624 //
4625 // Interpret the Perl expression from the fourth field of the data file,
4626 // building up an ICU string from the results of the ICU match.
4627 // The Perl expression will contain references to the results of
4628 // a regex match, including the matched string, capture group strings,
4629 // group starting and ending indicies, etc.
4630 //
4631 UnicodeString resultString;
4632 UnicodeString perlExpr = fields[3];
4633
4634 while (perlExpr.length() > 0) {
4635 groupsMat->reset(perlExpr);
4636 cgMat->reset(perlExpr);
4637
4638 if (perlExpr.startsWith("$&")) {
4639 resultString.append(testMat->group(status));
4640 perlExpr.remove(0, 2);
4641 }
4642
4643 else if (groupsMat->lookingAt(status)) {
4644 // $-[0] $+[2] etc.
4645 UnicodeString digitString = groupsMat->group(2, status);
4646 int32_t t = 0;
4647 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4648 UnicodeString plusOrMinus = groupsMat->group(1, status);
4649 int32_t matchPosition;
4650 if (plusOrMinus.compare("+") == 0) {
4651 matchPosition = testMat->end(groupNum, status);
4652 } else {
4653 matchPosition = testMat->start(groupNum, status);
4654 }
4655 if (matchPosition != -1) {
4656 ICU_Utility::appendNumber(resultString, matchPosition);
4657 }
4658 perlExpr.remove(0, groupsMat->end(status));
4659 }
4660
4661 else if (cgMat->lookingAt(status)) {
4662 // $1, $2, $3, etc.
4663 UnicodeString digitString = cgMat->group(1, status);
4664 int32_t t = 0;
4665 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4666 if (U_SUCCESS(status)) {
4667 resultString.append(testMat->group(groupNum, status));
4668 status = U_ZERO_ERROR;
4669 }
4670 perlExpr.remove(0, cgMat->end(status));
4671 }
4672
4673 else if (perlExpr.startsWith("@-")) {
4674 int32_t i;
4675 for (i=0; i<=testMat->groupCount(); i++) {
4676 if (i>0) {
4677 resultString.append(" ");
4678 }
4679 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4680 }
4681 perlExpr.remove(0, 2);
4682 }
4683
4684 else if (perlExpr.startsWith("@+")) {
4685 int32_t i;
4686 for (i=0; i<=testMat->groupCount(); i++) {
4687 if (i>0) {
4688 resultString.append(" ");
4689 }
4690 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4691 }
4692 perlExpr.remove(0, 2);
4693 }
4694
4695 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4696 // or as an escaped sequence (e.g. \n)
4697 if (perlExpr.length() > 1) {
4698 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4699 }
4700 UChar c = perlExpr.charAt(0);
4701 switch (c) {
4702 case 'n': c = '\n'; break;
4703 // add any other escape sequences that show up in the test expected results.
4704 }
4705 resultString.append(c);
4706 perlExpr.remove(0, 1);
4707 }
4708
4709 else {
4710 // Any characters from the perl expression that we don't explicitly
4711 // recognize before here are assumed to be literals and copied
4712 // as-is to the expected results.
4713 resultString.append(perlExpr.charAt(0));
4714 perlExpr.remove(0, 1);
4715 }
4716
4717 if (U_FAILURE(status)) {
4718 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4719 break;
4720 }
4721 }
4722
4723 //
4724 // Expected Results Compare
4725 //
4726 UnicodeString expectedS(fields[4]);
4727 expectedS.findAndReplace(nulnulSrc, nulnul);
4728 expectedS.findAndReplace(ffffSrc, ffff);
4729 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4730
4731
4732 if (expectedS.compare(resultString) != 0) {
4733 err("Line %d: Incorrect perl expression results.", lineNum);
4734 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4735 }
4736
4737 delete testMat;
4738 delete testPat;
4739 }
4740
4741 //
4742 // All done. Clean up allocated stuff.
4743 //
4744 delete cgMat;
4745 delete cgPat;
4746
4747 delete groupsMat;
4748 delete groupsPat;
4749
4750 delete flagMat;
4751 delete flagPat;
4752
4753 delete lineMat;
4754 delete linePat;
4755
4756 delete fieldPat;
4757 delete [] testData;
4758
4759 utext_close(&patternText);
4760 utext_close(&inputText);
4761
4762 delete [] patternChars;
4763 delete [] inputChars;
4764
4765
4766 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4767
4768}
4769
4770
4771//--------------------------------------------------------------
4772//
4773// Bug6149 Verify limits to heap expansion for backtrack stack.
4774// Use this pattern,
4775// "(a?){1,8000000}"
4776// Note: the upper bound was originally unbounded, but that case now has loop-breaking enabled.
4777// This test is likely to be fragile, as further optimizations stop
4778// more cases of pointless looping in the match engine.
4779//
4780//---------------------------------------------------------------
4781void RegexTest::Bug6149() {
4782 UnicodeString pattern("(a?){1,8000000}");
4783 UnicodeString s("xyz");
4784 uint32_t flags = 0;
4785 UErrorCode status = U_ZERO_ERROR;
4786
4787 RegexMatcher matcher(pattern, s, flags, status);
4788 UBool result = false;
4789 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4790 REGEX_ASSERT(result == FALSE);
4791 }
4792
4793
4794//
4795// Callbacks() Test the callback function.
4796// When set, callbacks occur periodically during matching operations,
4797// giving the application code the ability to abort the operation
4798// before its normal completion.
4799//
4800
4801struct callBackContext {
4802 RegexTest *test;
4803 int32_t maxCalls;
4804 int32_t numCalls;
4805 int32_t lastSteps;
4806 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4807};
4808
4809U_CDECL_BEGIN
4810static UBool U_CALLCONV
4811testCallBackFn(const void *context, int32_t steps) {
4812 callBackContext *info = (callBackContext *)context;
4813 if (info->lastSteps+1 != steps) {
4814 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4815 }
4816 info->lastSteps = steps;
4817 info->numCalls++;
4818 return (info->numCalls < info->maxCalls);
4819}
4820U_CDECL_END
4821
4822void RegexTest::Callbacks() {
4823 {
4824 // Getter returns NULLs if no callback has been set
4825
4826 // The variables that the getter will fill in.
4827 // Init to non-null values so that the action of the getter can be seen.
4828 const void *returnedContext = &returnedContext;
4829 URegexMatchCallback *returnedFn = &testCallBackFn;
4830
4831 UErrorCode status = U_ZERO_ERROR;
4832 RegexMatcher matcher("x", 0, status);
4833 REGEX_CHECK_STATUS;
4834 matcher.getMatchCallback(returnedFn, returnedContext, status);
4835 REGEX_CHECK_STATUS;
4836 REGEX_ASSERT(returnedFn == NULL);
4837 REGEX_ASSERT(returnedContext == NULL);
4838 }
4839
4840 {
4841 // Set and Get work
4842 callBackContext cbInfo = {this, 0, 0, 0};
4843 const void *returnedContext;
4844 URegexMatchCallback *returnedFn;
4845 UErrorCode status = U_ZERO_ERROR;
4846 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4847 REGEX_CHECK_STATUS;
4848 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4849 REGEX_CHECK_STATUS;
4850 matcher.getMatchCallback(returnedFn, returnedContext, status);
4851 REGEX_CHECK_STATUS;
4852 REGEX_ASSERT(returnedFn == testCallBackFn);
4853 REGEX_ASSERT(returnedContext == &cbInfo);
4854
4855 // A short-running match shouldn't invoke the callback
4856 status = U_ZERO_ERROR;
4857 cbInfo.reset(1);
4858 UnicodeString s = "xxx";
4859 matcher.reset(s);
4860 REGEX_ASSERT(matcher.matches(status));
4861 REGEX_CHECK_STATUS;
4862 REGEX_ASSERT(cbInfo.numCalls == 0);
4863
4864 // A medium-length match that runs long enough to invoke the
4865 // callback, but not so long that the callback aborts it.
4866 status = U_ZERO_ERROR;
4867 cbInfo.reset(4);
4868 s = "aaaaaaaaaaaaaaaaaaab";
4869 matcher.reset(s);
4870 REGEX_ASSERT(matcher.matches(status)==FALSE);
4871 REGEX_CHECK_STATUS;
4872 REGEX_ASSERT(cbInfo.numCalls > 0);
4873
4874 // A longer running match that the callback function will abort.
4875 status = U_ZERO_ERROR;
4876 cbInfo.reset(4);
4877 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4878 matcher.reset(s);
4879 REGEX_ASSERT(matcher.matches(status)==FALSE);
4880 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4881 REGEX_ASSERT(cbInfo.numCalls == 4);
4882
4883 // A longer running find that the callback function will abort.
4884 status = U_ZERO_ERROR;
4885 cbInfo.reset(4);
4886 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4887 matcher.reset(s);
4888 REGEX_ASSERT(matcher.find(status)==FALSE);
4889 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4890 REGEX_ASSERT(cbInfo.numCalls == 4);
4891 }
4892
4893
4894}
4895
4896
4897//
4898// FindProgressCallbacks() Test the find "progress" callback function.
4899// When set, the find progress callback will be invoked during find operations
4900// after each return from a match attempt, giving the application the opportunity
4901// to terminate a long-running find operation before its normal completion.
4902//
4903
4904struct progressCallBackContext {
4905 RegexTest *test;
4906 int64_t lastIndex;
4907 int32_t maxCalls;
4908 int32_t numCalls;
4909 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4910};
4911
4912// call-back function for find().
4913// Return TRUE to continue the find().
4914// Return FALSE to stop the find().
4915U_CDECL_BEGIN
4916static UBool U_CALLCONV
4917testProgressCallBackFn(const void *context, int64_t matchIndex) {
4918 progressCallBackContext *info = (progressCallBackContext *)context;
4919 info->numCalls++;
4920 info->lastIndex = matchIndex;
4921// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4922 return (info->numCalls < info->maxCalls);
4923}
4924U_CDECL_END
4925
4926void RegexTest::FindProgressCallbacks() {
4927 {
4928 // Getter returns NULLs if no callback has been set
4929
4930 // The variables that the getter will fill in.
4931 // Init to non-null values so that the action of the getter can be seen.
4932 const void *returnedContext = &returnedContext;
4933 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4934
4935 UErrorCode status = U_ZERO_ERROR;
4936 RegexMatcher matcher("x", 0, status);
4937 REGEX_CHECK_STATUS;
4938 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4939 REGEX_CHECK_STATUS;
4940 REGEX_ASSERT(returnedFn == NULL);
4941 REGEX_ASSERT(returnedContext == NULL);
4942 }
4943
4944 {
4945 // Set and Get work
4946 progressCallBackContext cbInfo = {this, 0, 0, 0};
4947 const void *returnedContext;
4948 URegexFindProgressCallback *returnedFn;
4949 UErrorCode status = U_ZERO_ERROR;
4950 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4951 REGEX_CHECK_STATUS;
4952 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4953 REGEX_CHECK_STATUS;
4954 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4955 REGEX_CHECK_STATUS;
4956 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4957 REGEX_ASSERT(returnedContext == &cbInfo);
4958
4959 // A find that matches on the initial position does NOT invoke the callback.
4960 status = U_ZERO_ERROR;
4961 cbInfo.reset(100);
4962 UnicodeString s = "aaxxx";
4963 matcher.reset(s);
4964#if 0
4965 matcher.setTrace(TRUE);
4966#endif
4967 REGEX_ASSERT(matcher.find(0, status));
4968 REGEX_CHECK_STATUS;
4969 REGEX_ASSERT(cbInfo.numCalls == 0);
4970
4971 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4972 // but not so many times that we interrupt the operation.
4973 status = U_ZERO_ERROR;
4974 s = "aaaaaaaaaaaaaaaaaaab";
4975 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4976 matcher.reset(s);
4977 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4978 REGEX_CHECK_STATUS;
4979 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4980
4981 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4982 status = U_ZERO_ERROR;
4983 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4984 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4985 matcher.reset(s1);
4986 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4987 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4988 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4989
4990 // Now a match that will succeed, but after an interruption
4991 status = U_ZERO_ERROR;
4992 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4993 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4994 matcher.reset(s2);
4995 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4996 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4997 // Now retry the match from where left off
4998 cbInfo.maxCalls = 100; // No callback limit
4999 status = U_ZERO_ERROR;
5000 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5001 REGEX_CHECK_STATUS;
5002 }
5003
5004
5005}
5006
5007
5008//---------------------------------------------------------------------------
5009//
5010// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5011// UTexts. The pure-C implementation of UText
5012// has no mutable backing stores, but we can
5013// use UnicodeString here to test the functionality.
5014//
5015//---------------------------------------------------------------------------
5016void RegexTest::PreAllocatedUTextCAPI () {
5017 UErrorCode status = U_ZERO_ERROR;
5018 URegularExpression *re;
5019 UText patternText = UTEXT_INITIALIZER;
5020 UnicodeString buffer;
5021 UText bufferText = UTEXT_INITIALIZER;
5022
5023 utext_openUnicodeString(&bufferText, &buffer, &status);
5024
5025 /*
5026 * getText() and getUText()
5027 */
5028 {
5029 UText text1 = UTEXT_INITIALIZER;
5030 UText text2 = UTEXT_INITIALIZER;
5031 UChar text2Chars[20];
5032 UText *resultText;
5033
5034 status = U_ZERO_ERROR;
5035 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5036 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5037 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5038 utext_openUChars(&text2, text2Chars, -1, &status);
5039
5040 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5041 re = uregex_openUText(&patternText, 0, NULL, &status);
5042
5043 /* First set a UText */
5044 uregex_setUText(re, &text1, &status);
5045 resultText = uregex_getUText(re, &bufferText, &status);
5046 REGEX_CHECK_STATUS;
5047 REGEX_ASSERT(resultText == &bufferText);
5048 utext_setNativeIndex(resultText, 0);
5049 utext_setNativeIndex(&text1, 0);
5050 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5051
5052 resultText = uregex_getUText(re, &bufferText, &status);
5053 REGEX_CHECK_STATUS;
5054 REGEX_ASSERT(resultText == &bufferText);
5055 utext_setNativeIndex(resultText, 0);
5056 utext_setNativeIndex(&text1, 0);
5057 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5058
5059 /* Then set a UChar * */
5060 uregex_setText(re, text2Chars, 7, &status);
5061 resultText = uregex_getUText(re, &bufferText, &status);
5062 REGEX_CHECK_STATUS;
5063 REGEX_ASSERT(resultText == &bufferText);
5064 utext_setNativeIndex(resultText, 0);
5065 utext_setNativeIndex(&text2, 0);
5066 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5067
5068 uregex_close(re);
5069 utext_close(&text1);
5070 utext_close(&text2);
5071 }
5072
5073 /*
5074 * group()
5075 */
5076 {
5077 UChar text1[80];
5078 UText *actual;
5079 UBool result;
5080 int64_t length = 0;
5081
5082 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5083 // 012345678901234567890123456789012345678901234567
5084 // 0 1 2 3 4
5085
5086 status = U_ZERO_ERROR;
5087 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5088 REGEX_CHECK_STATUS;
5089
5090 uregex_setText(re, text1, -1, &status);
5091 result = uregex_find(re, 0, &status);
5092 REGEX_ASSERT(result==TRUE);
5093
5094 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5095 status = U_ZERO_ERROR;
5096 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5097 REGEX_CHECK_STATUS;
5098 REGEX_ASSERT(actual == &bufferText);
5099 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5100 REGEX_ASSERT(length == 16);
5101 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5102
5103 /* Capture group #1. Should succeed, matching " interior ". */
5104 status = U_ZERO_ERROR;
5105 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5106 REGEX_CHECK_STATUS;
5107 REGEX_ASSERT(actual == &bufferText);
5108 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5109 REGEX_ASSERT(length == 10);
5110 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5111
5112 /* Capture group out of range. Error. */
5113 status = U_ZERO_ERROR;
5114 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5115 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5116 REGEX_ASSERT(actual == &bufferText);
5117 uregex_close(re);
5118
5119 }
5120
5121 /*
5122 * replaceFirst()
5123 */
5124 {
5125 UChar text1[80];
5126 UChar text2[80];
5127 UText replText = UTEXT_INITIALIZER;
5128 UText *result;
5129 status = U_ZERO_ERROR;
5130 utext_openUnicodeString(&bufferText, &buffer, &status);
5131
5132 status = U_ZERO_ERROR;
5133 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5134 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5135 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5136
5137 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5138 REGEX_CHECK_STATUS;
5139
5140 /* Normal case, with match */
5141 uregex_setText(re, text1, -1, &status);
5142 REGEX_CHECK_STATUS;
5143 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5144 REGEX_CHECK_STATUS;
5145 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5146 REGEX_CHECK_STATUS;
5147 REGEX_ASSERT(result == &bufferText);
5148 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5149
5150 /* No match. Text should copy to output with no changes. */
5151 uregex_setText(re, text2, -1, &status);
5152 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5153 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5154 REGEX_CHECK_STATUS;
5155 REGEX_ASSERT(result == &bufferText);
5156 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5157
5158 /* Unicode escapes */
5159 uregex_setText(re, text1, -1, &status);
5160 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5161 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5162 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5163 REGEX_CHECK_STATUS;
5164 REGEX_ASSERT(result == &bufferText);
5165 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5166
5167 uregex_close(re);
5168 utext_close(&replText);
5169 }
5170
5171
5172 /*
5173 * replaceAll()
5174 */
5175 {
5176 UChar text1[80];
5177 UChar text2[80];
5178 UText replText = UTEXT_INITIALIZER;
5179 UText *result;
5180
5181 status = U_ZERO_ERROR;
5182 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5183 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5184 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5185
5186 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5187 REGEX_CHECK_STATUS;
5188
5189 /* Normal case, with match */
5190 uregex_setText(re, text1, -1, &status);
5191 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5192 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5193 REGEX_CHECK_STATUS;
5194 REGEX_ASSERT(result == &bufferText);
5195 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5196
5197 /* No match. Text should copy to output with no changes. */
5198 uregex_setText(re, text2, -1, &status);
5199 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5200 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5201 REGEX_CHECK_STATUS;
5202 REGEX_ASSERT(result == &bufferText);
5203 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5204
5205 uregex_close(re);
5206 utext_close(&replText);
5207 }
5208
5209
5210 /*
5211 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5212 * so we don't need to test it here.
5213 */
5214
5215 utext_close(&bufferText);
5216 utext_close(&patternText);
5217}
5218
5219
5220//--------------------------------------------------------------
5221//
5222// NamedCapture Check basic named capture group functionality
5223//
5224//--------------------------------------------------------------
5225void RegexTest::NamedCapture() {
5226 UErrorCode status = U_ZERO_ERROR;
5227 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5228 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5229 REGEX_CHECK_STATUS;
5230 int32_t group = pat->groupNumberFromName("five", -1, status);
5231 REGEX_CHECK_STATUS;
5232 REGEX_ASSERT(5 == group);
5233 group = pat->groupNumberFromName("three", -1, status);
5234 REGEX_CHECK_STATUS;
5235 REGEX_ASSERT(3 == group);
5236
5237 status = U_ZERO_ERROR;
5238 group = pat->groupNumberFromName(UnicodeString("six"), status);
5239 REGEX_CHECK_STATUS;
5240 REGEX_ASSERT(6 == group);
5241
5242 status = U_ZERO_ERROR;
5243 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5244 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5245
5246 status = U_ZERO_ERROR;
5247
5248 // After copying a pattern, named capture should still work in the copy.
5249 RegexPattern *copiedPat = new RegexPattern(*pat);
5250 REGEX_ASSERT(*copiedPat == *pat);
5251 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5252
5253 group = copiedPat->groupNumberFromName("five", -1, status);
5254 REGEX_CHECK_STATUS;
5255 REGEX_ASSERT(5 == group);
5256 group = copiedPat->groupNumberFromName("three", -1, status);
5257 REGEX_CHECK_STATUS;
5258 REGEX_ASSERT(3 == group);
5259 delete copiedPat;
5260
5261 // ReplaceAll with named capture group.
5262 status = U_ZERO_ERROR;
5263 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5264 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5265 REGEX_CHECK_STATUS;
5266 // m.pattern().dumpPattern();
5267 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5268 REGEX_CHECK_STATUS;
5269 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5270 delete m;
5271
5272 // ReplaceAll, allowed capture group numbers.
5273 text = UnicodeString("abcmxyz");
5274 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5275 REGEX_CHECK_STATUS;
5276
5277 status = U_ZERO_ERROR;
5278 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5279 REGEX_CHECK_STATUS;
5280 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5281
5282 status = U_ZERO_ERROR;
5283 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5284 REGEX_CHECK_STATUS;
5285 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5286
5287 status = U_ZERO_ERROR;
5288 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5289 REGEX_CHECK_STATUS;
5290 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5291
5292 status = U_ZERO_ERROR;
5293 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5294 REGEX_CHECK_STATUS;
5295 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5296
5297 status = U_ZERO_ERROR;
5298 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5299 REGEX_CHECK_STATUS;
5300 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5301
5302 status = U_ZERO_ERROR;
5303 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5304 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5305
5306 status = U_ZERO_ERROR;
5307 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5308 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5309 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5310
5311 status = U_ZERO_ERROR;
5312 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5313 REGEX_CHECK_STATUS; // that push group num out of range.
5314 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5315
5316 status = U_ZERO_ERROR;
5317 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5318 REGEX_CHECK_STATUS;
5319 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5320
5321 status = U_ZERO_ERROR;
5322 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5323 REGEX_CHECK_STATUS;
5324 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5325
5326 status = U_ZERO_ERROR;
5327 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5328 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5329
5330 status = U_ZERO_ERROR;
5331 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5332 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5333
5334 status = U_ZERO_ERROR;
5335 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5336 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5337
5338 status = U_ZERO_ERROR;
5339 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5340 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5341
5342 delete m;
5343
5344 // Repeat the above replaceAll() tests using the plain C API, which
5345 // has a separate implementation internally.
5346 // TODO: factor out the test data.
5347
5348 status = U_ZERO_ERROR;
5349 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5350 REGEX_CHECK_STATUS;
5351 text = UnicodeString("abcmxyz");
5352 uregex_setText(re, text.getBuffer(), text.length(), &status);
5353 REGEX_CHECK_STATUS;
5354
5355 UChar resultBuf[100];
5356 int32_t resultLength;
5357 UnicodeString repl;
5358
5359 status = U_ZERO_ERROR;
5360 repl = UnicodeString("<$0>");
5361 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5362 REGEX_CHECK_STATUS;
5363 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5364
5365 status = U_ZERO_ERROR;
5366 repl = UnicodeString("<$1>");
5367 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5368 REGEX_CHECK_STATUS;
5369 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5370
5371 status = U_ZERO_ERROR;
5372 repl = UnicodeString("<${one}>");
5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374 REGEX_CHECK_STATUS;
5375 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5376
5377 status = U_ZERO_ERROR;
5378 repl = UnicodeString("<$2>");
5379 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380 REGEX_CHECK_STATUS;
5381 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5382
5383 status = U_ZERO_ERROR;
5384 repl = UnicodeString("<$3>");
5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386 REGEX_CHECK_STATUS;
5387 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5388
5389 status = U_ZERO_ERROR;
5390 repl = UnicodeString("<$4>");
5391 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5392 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5393
5394 status = U_ZERO_ERROR;
5395 repl = UnicodeString("<$04>");
5396 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5397 REGEX_CHECK_STATUS;
5398 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5399
5400 status = U_ZERO_ERROR;
5401 repl = UnicodeString("<$000016>");
5402 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5403 REGEX_CHECK_STATUS;
5404 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5405
5406 status = U_ZERO_ERROR;
5407 repl = UnicodeString("<$3$2$1${one}>");
5408 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409 REGEX_CHECK_STATUS;
5410 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5411
5412 status = U_ZERO_ERROR;
5413 repl = UnicodeString("$3$2$1${one}");
5414 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5415 REGEX_CHECK_STATUS;
5416 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5417
5418 status = U_ZERO_ERROR;
5419 repl = UnicodeString("<${noSuchName}>");
5420 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5421 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5422
5423 status = U_ZERO_ERROR;
5424 repl = UnicodeString("<${invalid-name}>");
5425 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5426 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5427
5428 status = U_ZERO_ERROR;
5429 repl = UnicodeString("<${one");
5430 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5431 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5432
5433 status = U_ZERO_ERROR;
5434 repl = UnicodeString("$not a capture group");
5435 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5436 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5437
5438 uregex_close(re);
5439}
5440
5441//--------------------------------------------------------------
5442//
5443// NamedCaptureLimits Patterns with huge numbers of named capture groups.
5444// The point is not so much what the exact limit is,
5445// but that a largish number doesn't hit bad non-linear performance,
5446// and that exceeding the limit fails cleanly.
5447//
5448//--------------------------------------------------------------
5449void RegexTest::NamedCaptureLimits() {
5450 if (quick) {
5451 logln("Skipping test. Runs in exhuastive mode only.");
5452 return;
5453 }
5454 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5455 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5456 char nnbuf[100];
5457 UnicodeString pattern;
5458 int32_t nn;
5459
5460 for (nn=1; nn<goodLimit; nn++) {
5461 sprintf(nnbuf, "(?<nn%d>)", nn);
5462 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5463 }
5464 UErrorCode status = U_ZERO_ERROR;
5465 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5466 REGEX_CHECK_STATUS;
5467 for (nn=1; nn<goodLimit; nn++) {
5468 sprintf(nnbuf, "nn%d", nn);
5469 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5470 REGEX_ASSERT(nn == groupNum);
5471 if (nn != groupNum) {
5472 break;
5473 }
5474 }
5475 delete pat;
5476
5477 pattern.remove();
5478 for (nn=1; nn<failLimit; nn++) {
5479 sprintf(nnbuf, "(?<nn%d>)", nn);
5480 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5481 }
5482 status = U_ZERO_ERROR;
5483 pat = RegexPattern::compile(pattern, 0, status);
5484 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5485 delete pat;
5486}
5487
5488
5489//--------------------------------------------------------------
5490//
5491// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5492//
5493//---------------------------------------------------------------
5494void RegexTest::Bug7651() {
5495 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5496 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5497 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5498 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5499 UnicodeString s("#ff @abcd This is test");
5500 RegexPattern *REPattern = NULL;
5501 RegexMatcher *REMatcher = NULL;
5502 UErrorCode status = U_ZERO_ERROR;
5503 UParseError pe;
5504
5505 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5506 REGEX_CHECK_STATUS;
5507 REMatcher = REPattern->matcher(s, status);
5508 REGEX_CHECK_STATUS;
5509 REGEX_ASSERT(REMatcher->find());
5510 REGEX_ASSERT(REMatcher->start(status) == 0);
5511 delete REPattern;
5512 delete REMatcher;
5513 status = U_ZERO_ERROR;
5514
5515 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5516 REGEX_CHECK_STATUS;
5517 REMatcher = REPattern->matcher(s, status);
5518 REGEX_CHECK_STATUS;
5519 REGEX_ASSERT(REMatcher->find());
5520 REGEX_ASSERT(REMatcher->start(status) == 0);
5521 delete REPattern;
5522 delete REMatcher;
5523 status = U_ZERO_ERROR;
5524 }
5525
5526void RegexTest::Bug7740() {
5527 UErrorCode status = U_ZERO_ERROR;
5528 UnicodeString pattern = "(a)";
5529 UnicodeString text = "abcdef";
5530 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5531 REGEX_CHECK_STATUS;
5532 REGEX_ASSERT(m->lookingAt(status));
5533 REGEX_CHECK_STATUS;
5534 status = U_ILLEGAL_ARGUMENT_ERROR;
5535 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5536 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5537 REGEX_ASSERT(s == "");
5538 delete m;
5539}
5540
5541// Bug 8479: was crashing whith a Bogus UnicodeString as input.
5542
5543void RegexTest::Bug8479() {
5544 UErrorCode status = U_ZERO_ERROR;
5545
5546 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5547 REGEX_CHECK_STATUS;
5548 if (U_SUCCESS(status))
5549 {
5550 UnicodeString str;
5551 str.setToBogus();
5552 pMatcher->reset(str);
5553 status = U_ZERO_ERROR;
5554 pMatcher->matches(status);
5555 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5556 delete pMatcher;
5557 }
5558}
5559
5560
5561// Bug 7029
5562void RegexTest::Bug7029() {
5563 UErrorCode status = U_ZERO_ERROR;
5564
5565 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5566 UnicodeString text = "abc.def";
5567 UnicodeString splits[10];
5568 REGEX_CHECK_STATUS;
5569 int32_t numFields = pMatcher->split(text, splits, 10, status);
5570 REGEX_CHECK_STATUS;
5571 REGEX_ASSERT(numFields == 8);
5572 delete pMatcher;
5573}
5574
5575// Bug 9283
5576// This test is checking for the existance of any supplemental characters that case-fold
5577// to a bmp character.
5578//
5579// At the time of this writing there are none. If any should appear in a subsequent release
5580// of Unicode, the code in regular expressions compilation that determines the longest
5581// posssible match for a literal string will need to be enhanced.
5582//
5583// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5584// for details on what to do in case of a failure of this test.
5585//
5586void RegexTest::Bug9283() {
5587#if !UCONFIG_NO_NORMALIZATION
5588 UErrorCode status = U_ZERO_ERROR;
5589 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5590 REGEX_CHECK_STATUS;
5591 int32_t index;
5592 UChar32 c;
5593 for (index=0; ; index++) {
5594 c = supplementalsWithCaseFolding.charAt(index);
5595 if (c == -1) {
5596 break;
5597 }
5598 UnicodeString cf = UnicodeString(c).foldCase();
5599 REGEX_ASSERT(cf.length() >= 2);
5600 }
5601#endif /* #if !UCONFIG_NO_NORMALIZATION */
5602}
5603
5604
5605void RegexTest::CheckInvBufSize() {
5606 if(inv_next>=INV_BUFSIZ) {
5607 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5608 __FILE__, INV_BUFSIZ, inv_next);
5609 } else {
5610 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5611 }
5612}
5613
5614
5615void RegexTest::Bug10459() {
5616 UErrorCode status = U_ZERO_ERROR;
5617 UnicodeString patternString("(txt)");
5618 UnicodeString txtString("txt");
5619
5620 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5621 REGEX_CHECK_STATUS;
5622 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5623 REGEX_CHECK_STATUS;
5624
5625 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5626 REGEX_CHECK_STATUS;
5627
5628 uregex_setUText(icu_re, utext_txt, &status);
5629 REGEX_CHECK_STATUS;
5630
5631 // The bug was that calling uregex_group() before doing a matching operation
5632 // was causing a segfault. Only for Regular Expressions created from UText.
5633 // It should set an U_REGEX_INVALID_STATE.
5634
5635 UChar buf[100];
5636 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5637 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5638 REGEX_ASSERT(len == 0);
5639
5640 uregex_close(icu_re);
5641 utext_close(utext_pat);
5642 utext_close(utext_txt);
5643}
5644
5645void RegexTest::TestCaseInsensitiveStarters() {
5646 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5647 // become stale because of new Unicode characters.
5648 // If it is stale, rerun the generation tool
5649 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5650 // and replace the embedded data in i18n/regexcmp.cpp
5651
5652 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5653 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5654 continue;
5655 }
5656 UnicodeSet s(cp, cp);
5657 s.closeOver(USET_CASE_INSENSITIVE);
5658 UnicodeSetIterator setIter(s);
5659 while (setIter.next()) {
5660 if (!setIter.isString()) {
5661 continue;
5662 }
5663 const UnicodeString &str = setIter.getString();
5664 UChar32 firstChar = str.char32At(0);
5665 UnicodeSet starters;
5666 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5667 if (!starters.contains(cp)) {
5668 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5669 return;
5670 }
5671 }
5672 }
5673}
5674
5675
5676void RegexTest::TestBug11049() {
5677 // Original bug report: pattern with match start consisting of one of several individual characters,
5678 // and the text being matched ending with a supplementary character. find() would read past the
5679 // end of the input text when searching for potential match starting points.
5680
5681 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5682 // detect the bad read.
5683
5684 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5685 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5686
5687 // Test again with a pattern starting with a single character,
5688 // which takes a different code path than starting with an OR expression,
5689 // but with similar logic.
5690 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5691 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5692}
5693
5694// Run a single test case from TestBug11049(). Internal function.
5695void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5696 UErrorCode status = U_ZERO_ERROR;
5697 UnicodeString patternString = UnicodeString(pattern).unescape();
5698 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5699
5700 UnicodeString dataString = UnicodeString(data).unescape();
5701 UChar *exactBuffer = new UChar[dataString.length()];
5702 dataString.extract(exactBuffer, dataString.length(), status);
5703 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5704
5705 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5706 REGEX_CHECK_STATUS;
5707 matcher->reset(ut);
5708 UBool result = matcher->find();
5709 if (result != expectMatch) {
5710 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5711 __FILE__, lineNumber, expectMatch, result, pattern, data);
5712 }
5713
5714 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5715 // off-by-one on find() with match at the last code point.
5716 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5717 // because string.unescape() will only shrink it.
5718 char * utf8Buffer = new char[uprv_strlen(data)+1];
5719 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5720 REGEX_CHECK_STATUS;
5721 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5722 REGEX_CHECK_STATUS;
5723 matcher->reset(ut);
5724 result = matcher->find();
5725 if (result != expectMatch) {
5726 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5727 __FILE__, lineNumber, expectMatch, result, pattern, data);
5728 }
5729 delete [] utf8Buffer;
5730
5731 utext_close(ut);
5732 delete [] exactBuffer;
5733}
5734
5735
5736void RegexTest::TestBug11371() {
5737 if (quick) {
5738 logln("Skipping test. Runs in exhuastive mode only.");
5739 return;
5740 }
5741 UErrorCode status = U_ZERO_ERROR;
5742 UnicodeString patternString;
5743
5744 for (int i=0; i<8000000; i++) {
5745 patternString.append(UnicodeString("()"));
5746 }
5747 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5748 if (status != U_REGEX_PATTERN_TOO_BIG) {
5749 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750 __FILE__, __LINE__, u_errorName(status));
5751 }
5752
5753 status = U_ZERO_ERROR;
5754 patternString = "(";
5755 for (int i=0; i<20000000; i++) {
5756 patternString.append(UnicodeString("A++"));
5757 }
5758 patternString.append(UnicodeString("){0}B++"));
5759 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5760 if (status != U_REGEX_PATTERN_TOO_BIG) {
5761 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762 __FILE__, __LINE__, u_errorName(status));
5763 }
5764
5765 // Pattern with too much string data, such that string indexes overflow operand data field size
5766 // in compiled instruction.
5767 status = U_ZERO_ERROR;
5768 patternString = "";
5769 while (patternString.length() < 0x00ffffff) {
5770 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5771 }
5772 patternString.append(UnicodeString("X? trailing string"));
5773 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5774 if (status != U_REGEX_PATTERN_TOO_BIG) {
5775 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5776 __FILE__, __LINE__, u_errorName(status));
5777 }
5778}
5779
5780void RegexTest::TestBug11480() {
5781 // C API, get capture group of a group that does not participate in the match.
5782 // (Returns a zero length string, with nul termination,
5783 // indistinguishable from a group with a zero lenght match.)
5784
5785 UErrorCode status = U_ZERO_ERROR;
5786 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5787 REGEX_CHECK_STATUS;
5788 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5789 uregex_setText(re, text.getBuffer(), text.length(), &status);
5790 REGEX_CHECK_STATUS;
5791 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5792 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5793 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5794 REGEX_ASSERT(length == 0);
5795 REGEX_ASSERT(buf[0] == 13);
5796 REGEX_ASSERT(buf[1] == 0);
5797 REGEX_ASSERT(buf[2] == 13);
5798 uregex_close(re);
5799}
5800
5801
5802#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */