]> git.saurik.com Git - apple/icu.git/blame_incremental - icuSources/test/intltest/regextst.cpp
ICU-66108.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
... / ...
CommitLineData
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9//
10// regextst.cpp
11//
12// ICU Regular Expressions test, part of intltest.
13//
14
15/*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25#include "intltest.h"
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31
32#include "unicode/localpointer.h"
33#include "unicode/regex.h"
34#include "unicode/uchar.h"
35#include "unicode/ucnv.h"
36#include "unicode/uniset.h"
37#include "unicode/uregex.h"
38#include "unicode/usetiter.h"
39#include "unicode/ustring.h"
40#include "unicode/utext.h"
41#include "unicode/utf16.h"
42#include "cstr.h"
43#include "regextst.h"
44#include "regexcmp.h"
45#include "uvector.h"
46#include "util.h"
47#include "cmemory.h"
48#include "cstring.h"
49#include "uinvchar.h"
50
51#define SUPPORT_MUTATING_INPUT_STRING 0
52
53//---------------------------------------------------------------------------
54//
55// Test class boilerplate
56//
57//---------------------------------------------------------------------------
58RegexTest::RegexTest()
59{
60}
61
62
63RegexTest::~RegexTest()
64{
65}
66
67
68
69void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
70{
71 if (exec) logln("TestSuite RegexTest: ");
72 TESTCASE_AUTO_BEGIN;
73 TESTCASE_AUTO(Basic);
74 TESTCASE_AUTO(API_Match);
75 TESTCASE_AUTO(API_Replace);
76 TESTCASE_AUTO(API_Pattern);
77#if !UCONFIG_NO_FILE_IO
78 TESTCASE_AUTO(Extended);
79#endif
80 TESTCASE_AUTO(Errors);
81 TESTCASE_AUTO(PerlTests);
82 TESTCASE_AUTO(Callbacks);
83 TESTCASE_AUTO(FindProgressCallbacks);
84 TESTCASE_AUTO(Bug6149);
85 TESTCASE_AUTO(UTextBasic);
86 TESTCASE_AUTO(API_Match_UTF8);
87 TESTCASE_AUTO(API_Replace_UTF8);
88 TESTCASE_AUTO(API_Pattern_UTF8);
89 TESTCASE_AUTO(PerlTestsUTF8);
90 TESTCASE_AUTO(PreAllocatedUTextCAPI);
91 TESTCASE_AUTO(Bug7651);
92 TESTCASE_AUTO(Bug7740);
93 TESTCASE_AUTO(Bug8479);
94 TESTCASE_AUTO(Bug7029);
95 TESTCASE_AUTO(CheckInvBufSize);
96 TESTCASE_AUTO(Bug9283);
97 TESTCASE_AUTO(Bug10459);
98 TESTCASE_AUTO(TestCaseInsensitiveStarters);
99 TESTCASE_AUTO(TestBug11049);
100 TESTCASE_AUTO(TestBug11371);
101 TESTCASE_AUTO(TestBug11480);
102 TESTCASE_AUTO(NamedCapture);
103 TESTCASE_AUTO(NamedCaptureLimits);
104 TESTCASE_AUTO(TestBug12884);
105 TESTCASE_AUTO(TestBug13631);
106 TESTCASE_AUTO(TestBug13632);
107 TESTCASE_AUTO(TestBug20359);
108 TESTCASE_AUTO(TestBug20863);
109 TESTCASE_AUTO_END;
110}
111
112
113/**
114 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
115 * into ASCII.
116 * @see utext_openUTF8
117 */
118static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
119
120//---------------------------------------------------------------------------
121//
122// Error Checking / Reporting macros used in all of the tests.
123//
124//---------------------------------------------------------------------------
125
126static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
127 int64_t oldIndex = utext_getNativeIndex(text);
128 utext_setNativeIndex(text, 0);
129 char *bufPtr = buf;
130 UChar32 c = utext_next32From(text, 0);
131 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
132 if (0x000020<=c && c<0x00007e) {
133 *bufPtr = c;
134 } else {
135#if 0
136 sprintf(bufPtr,"U+%04X", c);
137 bufPtr+= strlen(bufPtr)-1;
138#else
139 *bufPtr = '%';
140#endif
141 }
142 bufPtr++;
143 c = UTEXT_NEXT32(text);
144 }
145 *bufPtr = 0;
146#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
147 char *ebuf = (char*)malloc(bufLen);
148 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
149 uprv_strncpy(buf, ebuf, bufLen);
150 free((void*)ebuf);
151#endif
152 utext_setNativeIndex(text, oldIndex);
153}
154
155
156static char ASSERT_BUF[1024];
157
158const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
159 if(message.length()==0) {
160 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
161 } else {
162 UnicodeString buf;
163 IntlTest::prettify(message,buf);
164 if(buf.length()==0) {
165 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
166 } else {
167 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
168 if(ASSERT_BUF[0]==0) {
169 ASSERT_BUF[0]=0;
170 for(int32_t i=0;i<buf.length();i++) {
171 UChar ch = buf[i];
172 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
173 }
174 }
175 }
176 }
177 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
178 return ASSERT_BUF;
179}
180
// Logs (via logln) a printable rendering of the UText `text`, tagged with the
// source location and the spelling of the argument.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
} UPRV_BLOCK_MACRO_END

// Reports a failure and RETURNS from the enclosing function if the
// in-scope variable `status` holds a failure code.
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a failure if `expr` is false.  Does not return.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END

// Evaluates `expr` against a fresh, macro-local `status` (which shadows any
// outer variable of the same name) and reports a failure unless that local
// status ends up equal to `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied `line`
// and does NOT return.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
} UPRV_BLOCK_MACRO_END

// Like REGEX_ASSERT, but also reports the caller-supplied `line`
// and RETURNS from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
    if (!(expr)) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Compares an expected invariant-character C string with an actual string.
//   expected: const char * , restricted to invariant characters.
//   actual:   const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
} UPRV_BLOCK_MACRO_END
231
232
233static UBool testUTextEqual(UText *uta, UText *utb) {
234 UChar32 ca = 0;
235 UChar32 cb = 0;
236 utext_setNativeIndex(uta, 0);
237 utext_setNativeIndex(utb, 0);
238 do {
239 ca = utext_next32(uta);
240 cb = utext_next32(utb);
241 if (ca != cb) {
242 break;
243 }
244 } while (ca != U_SENTINEL);
245 return ca == cb;
246}
247
248
249/**
250 * @param expected expected text in UTF-8 (not platform) codepage
251 */
252void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
253 UErrorCode status = U_ZERO_ERROR;
254 UText expectedText = UTEXT_INITIALIZER;
255 utext_openUTF8(&expectedText, expected, -1, &status);
256 if(U_FAILURE(status)) {
257 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
258 return;
259 }
260 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
261 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
262 return;
263 }
264 utext_setNativeIndex(actual, 0);
265 if (!testUTextEqual(&expectedText, actual)) {
266 char buf[201 /*21*/];
267 char expectedBuf[201];
268 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
269 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
270 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
271 }
272 utext_close(&expectedText);
273}
274/**
275 * @param expected invariant (platform local text) input
276 */
277
278void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
279 UErrorCode status = U_ZERO_ERROR;
280 UText expectedText = UTEXT_INITIALIZER;
281 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
282 if(U_FAILURE(status)) {
283 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
284 return;
285 }
286 utext_setNativeIndex(actual, 0);
287 if (!testUTextEqual(&expectedText, actual)) {
288 char buf[201 /*21*/];
289 char expectedBuf[201];
290 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
291 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
292 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
293 }
294 utext_close(&expectedText);
295}
296
/**
 * Checks a UText against an expected string given as UTF-8.
 * Forwards to assertUText() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Checks a UText against an expected string given in invariant characters.
 * Forwards to assertUTextInvariant() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
305
306/**
307 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
308 * passed into utext_openUTF8. An error will be given if
309 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
310 */
311
312#define INV_BUFSIZ 2048 /* increase this if too small */
313
314static int64_t inv_next=0;
315
316#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
317static char inv_buf[INV_BUFSIZ];
318#endif
319
320static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
321 if(length==-1) length=strlen(inv);
322#if U_CHARSET_FAMILY==U_ASCII_FAMILY
323 inv_next+=length;
324 return utext_openUTF8(ut, inv, length, status);
325#else
326 if(inv_next+length+1>INV_BUFSIZ) {
327 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
328 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
329 *status = U_MEMORY_ALLOCATION_ERROR;
330 return NULL;
331 }
332
333 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
334 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
335 inv_next+=length;
336
337#if 0
338 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
339#endif
340
341 return utext_openUTF8(ut, (const char*)buf, length, status);
342#endif
343}
344
345
346//---------------------------------------------------------------------------
347//
348// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
349// for the LookingAt() and Match() functions.
350//
351// usage:
352// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
353//
354// The expected results are UBool - TRUE or FALSE.
355// The input text is unescaped. The pattern is not.
356//
357//
358//---------------------------------------------------------------------------
359
360#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
361 doRegexLMTest(pat, text, looking, match, __LINE__); \
362 doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
363} UPRV_BLOCK_MACRO_END
364
365UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
366 const UnicodeString pattern(pat, -1, US_INV);
367 const UnicodeString inputText(text, -1, US_INV);
368 UErrorCode status = U_ZERO_ERROR;
369 UParseError pe;
370 RegexPattern *REPattern = NULL;
371 RegexMatcher *REMatcher = NULL;
372 UBool retVal = TRUE;
373
374 UnicodeString patString(pat, -1, US_INV);
375 REPattern = RegexPattern::compile(patString, 0, pe, status);
376 if (U_FAILURE(status)) {
377 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
378 line, u_errorName(status));
379 return FALSE;
380 }
381 if (line==376) { REPattern->dumpPattern();}
382
383 UnicodeString inputString(inputText);
384 UnicodeString unEscapedInput = inputString.unescape();
385 REMatcher = REPattern->matcher(unEscapedInput, status);
386 if (U_FAILURE(status)) {
387 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
388 line, u_errorName(status));
389 return FALSE;
390 }
391
392 UBool actualmatch;
393 actualmatch = REMatcher->lookingAt(status);
394 if (U_FAILURE(status)) {
395 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
396 line, u_errorName(status));
397 retVal = FALSE;
398 }
399 if (actualmatch != looking) {
400 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
401 retVal = FALSE;
402 }
403
404 status = U_ZERO_ERROR;
405 actualmatch = REMatcher->matches(status);
406 if (U_FAILURE(status)) {
407 errln("RegexTest failure in matches() at line %d. Status = %s\n",
408 line, u_errorName(status));
409 retVal = FALSE;
410 }
411 if (actualmatch != match) {
412 errln("RegexTest: wrong return from matches() at line %d.\n", line);
413 retVal = FALSE;
414 }
415
416 if (retVal == FALSE) {
417 REPattern->dumpPattern();
418 }
419
420 delete REPattern;
421 delete REMatcher;
422 return retVal;
423}
424
425
426UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
427 UText pattern = UTEXT_INITIALIZER;
428 int32_t inputUTF8Length;
429 char *textChars = NULL;
430 UText inputText = UTEXT_INITIALIZER;
431 UErrorCode status = U_ZERO_ERROR;
432 UParseError pe;
433 RegexPattern *REPattern = NULL;
434 RegexMatcher *REMatcher = NULL;
435 UBool retVal = TRUE;
436
437 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
438 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
439 if (U_FAILURE(status)) {
440 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
441 line, u_errorName(status));
442 return FALSE;
443 }
444
445 UnicodeString inputString(text, -1, US_INV);
446 UnicodeString unEscapedInput = inputString.unescape();
447 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
448 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
449
450 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
451 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
452 // UTF-8 does not allow unpaired surrogates, so this could actually happen
453 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
454 return TRUE; // not a failure of the Regex engine
455 }
456 status = U_ZERO_ERROR; // buffer overflow
457 textChars = new char[inputUTF8Length+1];
458 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
459 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
460
461 REMatcher = &REPattern->matcher(status)->reset(&inputText);
462 if (U_FAILURE(status)) {
463 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
464 line, u_errorName(status));
465 return FALSE;
466 }
467
468 UBool actualmatch;
469 actualmatch = REMatcher->lookingAt(status);
470 if (U_FAILURE(status)) {
471 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
472 line, u_errorName(status));
473 retVal = FALSE;
474 }
475 if (actualmatch != looking) {
476 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
477 retVal = FALSE;
478 }
479
480 status = U_ZERO_ERROR;
481 actualmatch = REMatcher->matches(status);
482 if (U_FAILURE(status)) {
483 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
484 line, u_errorName(status));
485 retVal = FALSE;
486 }
487 if (actualmatch != match) {
488 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
489 retVal = FALSE;
490 }
491
492 if (retVal == FALSE) {
493 REPattern->dumpPattern();
494 }
495
496 delete REPattern;
497 delete REMatcher;
498 utext_close(&inputText);
499 utext_close(&pattern);
500 delete[] textChars;
501 return retVal;
502}
503
504
505
506//---------------------------------------------------------------------------
507//
508// REGEX_ERR Macro + invocation function to simplify writing tests
509// regex tests for incorrect patterns
510//
511// usage:
512// REGEX_ERR("pattern", expected error line, column, expected status);
513//
514//---------------------------------------------------------------------------
515#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
516
517void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
518 UErrorCode expectedStatus, int32_t line) {
519 UnicodeString pattern(pat);
520
521 UErrorCode status = U_ZERO_ERROR;
522 UParseError pe;
523 RegexPattern *callerPattern = NULL;
524
525 //
526 // Compile the caller's pattern
527 //
528 UnicodeString patString(pat);
529 callerPattern = RegexPattern::compile(patString, 0, pe, status);
530 if (status != expectedStatus) {
531 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
532 } else {
533 if (status != U_ZERO_ERROR) {
534 if (pe.line != errLine || pe.offset != errCol) {
535 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
536 line, errLine, errCol, pe.line, pe.offset);
537 }
538 }
539 }
540
541 delete callerPattern;
542
543 //
544 // Compile again, using a UTF-8-based UText
545 //
546 UText patternText = UTEXT_INITIALIZER;
547 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
548 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
549 if (status != expectedStatus) {
550 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
551 } else {
552 if (status != U_ZERO_ERROR) {
553 if (pe.line != errLine || pe.offset != errCol) {
554 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
555 line, errLine, errCol, pe.line, pe.offset);
556 }
557 }
558 }
559
560 delete callerPattern;
561 utext_close(&patternText);
562}
563
564
565
566//---------------------------------------------------------------------------
567//
568// Basic Check for basic functionality of regex pattern matching.
569// Avoid the use of REGEX_FIND test macro, which has
570// substantial dependencies on basic Regex functionality.
571//
572//---------------------------------------------------------------------------
573void RegexTest::Basic() {
574
575
576//
577// Debug - slide failing test cases early
578//
579#if 0
580 {
581 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
582 UParseError pe;
583 UErrorCode status = U_ZERO_ERROR;
584 RegexPattern *pattern;
585 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
586 pattern->dumpPattern();
587 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
588 UBool result = m->find();
589 printf("result = %d\n", result);
590 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
591 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
592 }
593 exit(1);
594#endif
595
596
597 //
598 // Pattern with parentheses
599 //
600 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
601 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
602 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
603
604 //
605 // Patterns with *
606 //
607 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
608 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
609 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
610 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
611 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
612
613 REGEX_TESTLM("a*", "", TRUE, TRUE);
614 REGEX_TESTLM("a*", "b", TRUE, FALSE);
615
616
617 //
618 // Patterns with "."
619 //
620 REGEX_TESTLM(".", "abc", TRUE, FALSE);
621 REGEX_TESTLM("...", "abc", TRUE, TRUE);
622 REGEX_TESTLM("....", "abc", FALSE, FALSE);
623 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
624 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
625 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
626 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
627 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
628
629 //
630 // Patterns with * applied to chars at end of literal string
631 //
632 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
633 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
634
635 //
636 // Supplemental chars match as single chars, not a pair of surrogates.
637 //
638 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
639 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
640 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
641
642
643 //
644 // UnicodeSets in the pattern
645 //
646 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
647 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
648 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
649 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
650 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
651 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
652
653 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
654 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
655 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
656 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
657 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
658
659 //
660 // OR operator in patterns
661 //
662 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
663 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
664 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
665 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
666
667 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
668 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
669 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
670 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
671 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
672 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
673
674 //
675 // +
676 //
677 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
678 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
679 REGEX_TESTLM("b+", "", FALSE, FALSE);
680 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
681 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
682 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
683
684 //
685 // ?
686 //
687 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
688 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
689 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
690 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
691 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
692 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
693 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
694 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
695 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
696
697 //
698 // Escape sequences that become single literal chars, handled internally
699 // by ICU's Unescape.
700 //
701
702 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
703 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
704 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
705 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
706 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
707 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
708 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
709 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
710 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
711 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
712
713 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
714 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
715
716 // Escape of special chars in patterns
717 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
718}
719
720
721//---------------------------------------------------------------------------
722//
723// UTextBasic Check for quirks that are specific to the UText
724// implementation.
725//
726//---------------------------------------------------------------------------
727void RegexTest::UTextBasic() {
728 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
729 UErrorCode status = U_ZERO_ERROR;
730 UText pattern = UTEXT_INITIALIZER;
731 utext_openUTF8(&pattern, str_abc, -1, &status);
732 RegexMatcher matcher(&pattern, 0, status);
733 REGEX_CHECK_STATUS;
734
735 UText input = UTEXT_INITIALIZER;
736 utext_openUTF8(&input, str_abc, -1, &status);
737 REGEX_CHECK_STATUS;
738 matcher.reset(&input);
739 REGEX_CHECK_STATUS;
740 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
741
742 matcher.reset(matcher.inputText());
743 REGEX_CHECK_STATUS;
744 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
745
746 utext_close(&pattern);
747 utext_close(&input);
748}
749
750
751//---------------------------------------------------------------------------
752//
753// API_Match Test that the API for class RegexMatcher
754// is present and nominally working, but excluding functions
755// implementing replace operations.
756//
757//---------------------------------------------------------------------------
758void RegexTest::API_Match() {
759 UParseError pe;
760 UErrorCode status=U_ZERO_ERROR;
761 int32_t flags = 0;
762
763 //
764 // Debug - slide failing test cases early
765 //
766#if 0
767 {
768 }
769 return;
770#endif
771
772 //
773 // Simple pattern compilation
774 //
775 {
776 UnicodeString re("abc");
777 RegexPattern *pat2;
778 pat2 = RegexPattern::compile(re, flags, pe, status);
779 REGEX_CHECK_STATUS;
780
781 UnicodeString inStr1 = "abcdef this is a test";
782 UnicodeString instr2 = "not abc";
783 UnicodeString empty = "";
784
785
786 //
787 // Matcher creation and reset.
788 //
789 RegexMatcher *m1 = pat2->matcher(inStr1, status);
790 REGEX_CHECK_STATUS;
791 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
792 REGEX_ASSERT(m1->input() == inStr1);
793 m1->reset(instr2);
794 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
795 REGEX_ASSERT(m1->input() == instr2);
796 m1->reset(inStr1);
797 REGEX_ASSERT(m1->input() == inStr1);
798 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
799 m1->reset(empty);
800 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
801 REGEX_ASSERT(m1->input() == empty);
802 REGEX_ASSERT(&m1->pattern() == pat2);
803
804 //
805 // reset(pos, status)
806 //
807 m1->reset(inStr1);
808 m1->reset(4, status);
809 REGEX_CHECK_STATUS;
810 REGEX_ASSERT(m1->input() == inStr1);
811 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
812
813 m1->reset(-1, status);
814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
815 status = U_ZERO_ERROR;
816
817 m1->reset(0, status);
818 REGEX_CHECK_STATUS;
819 status = U_ZERO_ERROR;
820
821 int32_t len = m1->input().length();
822 m1->reset(len-1, status);
823 REGEX_CHECK_STATUS;
824 status = U_ZERO_ERROR;
825
826 m1->reset(len, status);
827 REGEX_CHECK_STATUS;
828 status = U_ZERO_ERROR;
829
830 m1->reset(len+1, status);
831 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
832 status = U_ZERO_ERROR;
833
834 //
835 // match(pos, status)
836 //
837 m1->reset(instr2);
838 REGEX_ASSERT(m1->matches(4, status) == TRUE);
839 m1->reset();
840 REGEX_ASSERT(m1->matches(3, status) == FALSE);
841 m1->reset();
842 REGEX_ASSERT(m1->matches(5, status) == FALSE);
843 REGEX_ASSERT(m1->matches(4, status) == TRUE);
844 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
845 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
846
847 // Match() at end of string should fail, but should not
848 // be an error.
849 status = U_ZERO_ERROR;
850 len = m1->input().length();
851 REGEX_ASSERT(m1->matches(len, status) == FALSE);
852 REGEX_CHECK_STATUS;
853
854 // Match beyond end of string should fail with an error.
855 status = U_ZERO_ERROR;
856 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
857 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858
859 // Successful match at end of string.
860 {
861 status = U_ZERO_ERROR;
862 RegexMatcher m("A?", 0, status); // will match zero length string.
863 REGEX_CHECK_STATUS;
864 m.reset(inStr1);
865 len = inStr1.length();
866 REGEX_ASSERT(m.matches(len, status) == TRUE);
867 REGEX_CHECK_STATUS;
868 m.reset(empty);
869 REGEX_ASSERT(m.matches(0, status) == TRUE);
870 REGEX_CHECK_STATUS;
871 }
872
873
874 //
875 // lookingAt(pos, status)
876 //
877 status = U_ZERO_ERROR;
878 m1->reset(instr2); // "not abc"
879 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
880 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
881 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
882 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
883 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885 status = U_ZERO_ERROR;
886 len = m1->input().length();
887 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
888 REGEX_CHECK_STATUS;
889 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
890 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
891
892 delete m1;
893 delete pat2;
894 }
895
896
897 //
898 // Capture Group.
899 // RegexMatcher::start();
900 // RegexMatcher::end();
901 // RegexMatcher::groupCount();
902 //
903 {
904 int32_t flags=0;
905 UParseError pe;
906 UErrorCode status=U_ZERO_ERROR;
907
908 UnicodeString re("01(23(45)67)(.*)");
909 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
910 REGEX_CHECK_STATUS;
911 UnicodeString data = "0123456789";
912
913 RegexMatcher *matcher = pat->matcher(data, status);
914 REGEX_CHECK_STATUS;
915 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
916 static const int32_t matchStarts[] = {0, 2, 4, 8};
917 static const int32_t matchEnds[] = {10, 8, 6, 10};
918 int32_t i;
919 for (i=0; i<4; i++) {
920 int32_t actualStart = matcher->start(i, status);
921 REGEX_CHECK_STATUS;
922 if (actualStart != matchStarts[i]) {
923 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
924 __LINE__, i, matchStarts[i], actualStart);
925 }
926 int32_t actualEnd = matcher->end(i, status);
927 REGEX_CHECK_STATUS;
928 if (actualEnd != matchEnds[i]) {
929 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
930 __LINE__, i, matchEnds[i], actualEnd);
931 }
932 }
933
934 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
935 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
936
937 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
938 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
939 matcher->reset();
940 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
941
942 matcher->lookingAt(status);
943 REGEX_ASSERT(matcher->group(status) == "0123456789");
944 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
945 REGEX_ASSERT(matcher->group(1, status) == "234567" );
946 REGEX_ASSERT(matcher->group(2, status) == "45" );
947 REGEX_ASSERT(matcher->group(3, status) == "89" );
948 REGEX_CHECK_STATUS;
949 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
950 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
951 matcher->reset();
952 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
953
954 delete matcher;
955 delete pat;
956
957 }
958
959 //
960 // find
961 //
962 {
963 int32_t flags=0;
964 UParseError pe;
965 UErrorCode status=U_ZERO_ERROR;
966
967 UnicodeString re("abc");
968 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
969 REGEX_CHECK_STATUS;
970 UnicodeString data = ".abc..abc...abc..";
971 // 012345678901234567
972
973 RegexMatcher *matcher = pat->matcher(data, status);
974 REGEX_CHECK_STATUS;
975 REGEX_ASSERT(matcher->find());
976 REGEX_ASSERT(matcher->start(status) == 1);
977 REGEX_ASSERT(matcher->find());
978 REGEX_ASSERT(matcher->start(status) == 6);
979 REGEX_ASSERT(matcher->find());
980 REGEX_ASSERT(matcher->start(status) == 12);
981 REGEX_ASSERT(matcher->find() == FALSE);
982 REGEX_ASSERT(matcher->find() == FALSE);
983
984 matcher->reset();
985 REGEX_ASSERT(matcher->find());
986 REGEX_ASSERT(matcher->start(status) == 1);
987
988 REGEX_ASSERT(matcher->find(0, status));
989 REGEX_ASSERT(matcher->start(status) == 1);
990 REGEX_ASSERT(matcher->find(1, status));
991 REGEX_ASSERT(matcher->start(status) == 1);
992 REGEX_ASSERT(matcher->find(2, status));
993 REGEX_ASSERT(matcher->start(status) == 6);
994 REGEX_ASSERT(matcher->find(12, status));
995 REGEX_ASSERT(matcher->start(status) == 12);
996 REGEX_ASSERT(matcher->find(13, status) == FALSE);
997 REGEX_ASSERT(matcher->find(16, status) == FALSE);
998 REGEX_ASSERT(matcher->find(17, status) == FALSE);
999 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1000
1001 status = U_ZERO_ERROR;
1002 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1003 status = U_ZERO_ERROR;
1004 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1005
1006 REGEX_ASSERT(matcher->groupCount() == 0);
1007
1008 delete matcher;
1009 delete pat;
1010 }
1011
1012
1013 //
1014 // find, with \G in pattern (true if at the end of a previous match).
1015 //
1016 {
1017 int32_t flags=0;
1018 UParseError pe;
1019 UErrorCode status=U_ZERO_ERROR;
1020
1021 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1022 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1023 REGEX_CHECK_STATUS;
1024 UnicodeString data = ".abcabc.abc..";
1025 // 012345678901234567
1026
1027 RegexMatcher *matcher = pat->matcher(data, status);
1028 REGEX_CHECK_STATUS;
1029 REGEX_ASSERT(matcher->find());
1030 REGEX_ASSERT(matcher->start(status) == 0);
1031 REGEX_ASSERT(matcher->start(1, status) == -1);
1032 REGEX_ASSERT(matcher->start(2, status) == 1);
1033
1034 REGEX_ASSERT(matcher->find());
1035 REGEX_ASSERT(matcher->start(status) == 4);
1036 REGEX_ASSERT(matcher->start(1, status) == 4);
1037 REGEX_ASSERT(matcher->start(2, status) == -1);
1038 REGEX_CHECK_STATUS;
1039
1040 delete matcher;
1041 delete pat;
1042 }
1043
1044 //
1045 // find with zero length matches, match position should bump ahead
1046 // to prevent loops.
1047 //
1048 {
1049 int32_t i;
1050 UErrorCode status=U_ZERO_ERROR;
1051 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1052 // using an always-true look-ahead.
1053 REGEX_CHECK_STATUS;
1054 UnicodeString s(" ");
1055 m.reset(s);
1056 for (i=0; ; i++) {
1057 if (m.find() == FALSE) {
1058 break;
1059 }
1060 REGEX_ASSERT(m.start(status) == i);
1061 REGEX_ASSERT(m.end(status) == i);
1062 }
1063 REGEX_ASSERT(i==5);
1064
1065 // Check that the bump goes over surrogate pairs OK
1066 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1067 s = s.unescape();
1068 m.reset(s);
1069 for (i=0; ; i+=2) {
1070 if (m.find() == FALSE) {
1071 break;
1072 }
1073 REGEX_ASSERT(m.start(status) == i);
1074 REGEX_ASSERT(m.end(status) == i);
1075 }
1076 REGEX_ASSERT(i==10);
1077 }
1078 {
1079 // find() loop breaking test.
1080 // with pattern of /.?/, should see a series of one char matches, then a single
1081 // match of zero length at the end of the input string.
1082 int32_t i;
1083 UErrorCode status=U_ZERO_ERROR;
1084 RegexMatcher m(".?", 0, status);
1085 REGEX_CHECK_STATUS;
1086 UnicodeString s(" ");
1087 m.reset(s);
1088 for (i=0; ; i++) {
1089 if (m.find() == FALSE) {
1090 break;
1091 }
1092 REGEX_ASSERT(m.start(status) == i);
1093 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1094 }
1095 REGEX_ASSERT(i==5);
1096 }
1097
1098
1099 //
1100 // Matchers with no input string behave as if they had an empty input string.
1101 //
1102
1103 {
1104 UErrorCode status = U_ZERO_ERROR;
1105 RegexMatcher m(".?", 0, status);
1106 REGEX_CHECK_STATUS;
1107 REGEX_ASSERT(m.find());
1108 REGEX_ASSERT(m.start(status) == 0);
1109 REGEX_ASSERT(m.input() == "");
1110 }
1111 {
1112 UErrorCode status = U_ZERO_ERROR;
1113 RegexPattern *p = RegexPattern::compile(".", 0, status);
1114 RegexMatcher *m = p->matcher(status);
1115 REGEX_CHECK_STATUS;
1116
1117 REGEX_ASSERT(m->find() == FALSE);
1118 REGEX_ASSERT(m->input() == "");
1119 delete m;
1120 delete p;
1121 }
1122
1123 //
1124 // Regions
1125 //
1126 {
1127 UErrorCode status = U_ZERO_ERROR;
1128 UnicodeString testString("This is test data");
1129 RegexMatcher m(".*", testString, 0, status);
1130 REGEX_CHECK_STATUS;
1131 REGEX_ASSERT(m.regionStart() == 0);
1132 REGEX_ASSERT(m.regionEnd() == testString.length());
1133 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1134 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1135
1136 m.region(2,4, status);
1137 REGEX_CHECK_STATUS;
1138 REGEX_ASSERT(m.matches(status));
1139 REGEX_ASSERT(m.start(status)==2);
1140 REGEX_ASSERT(m.end(status)==4);
1141 REGEX_CHECK_STATUS;
1142
1143 m.reset();
1144 REGEX_ASSERT(m.regionStart() == 0);
1145 REGEX_ASSERT(m.regionEnd() == testString.length());
1146
1147 UnicodeString shorterString("short");
1148 m.reset(shorterString);
1149 REGEX_ASSERT(m.regionStart() == 0);
1150 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1151
1152 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1153 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1154 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1155 REGEX_ASSERT(&m == &m.reset());
1156 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1157
1158 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1159 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1160 REGEX_ASSERT(&m == &m.reset());
1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1164 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1165 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1166 REGEX_ASSERT(&m == &m.reset());
1167 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1168
1169 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1170 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1171 REGEX_ASSERT(&m == &m.reset());
1172 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1173
1174 }
1175
1176 //
1177 // hitEnd() and requireEnd()
1178 //
1179 {
1180 UErrorCode status = U_ZERO_ERROR;
1181 UnicodeString testString("aabb");
1182 RegexMatcher m1(".*", testString, 0, status);
1183 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1184 REGEX_ASSERT(m1.hitEnd() == TRUE);
1185 REGEX_ASSERT(m1.requireEnd() == FALSE);
1186 REGEX_CHECK_STATUS;
1187
1188 status = U_ZERO_ERROR;
1189 RegexMatcher m2("a*", testString, 0, status);
1190 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1191 REGEX_ASSERT(m2.hitEnd() == FALSE);
1192 REGEX_ASSERT(m2.requireEnd() == FALSE);
1193 REGEX_CHECK_STATUS;
1194
1195 status = U_ZERO_ERROR;
1196 RegexMatcher m3(".*$", testString, 0, status);
1197 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1198 REGEX_ASSERT(m3.hitEnd() == TRUE);
1199 REGEX_ASSERT(m3.requireEnd() == TRUE);
1200 REGEX_CHECK_STATUS;
1201 }
1202
1203
1204 //
1205 // Compilation error on reset with UChar *
1206 // These were a hazard that people were stumbling over with runtime errors.
1207 // Changed them to compiler errors by adding private methods that more closely
1208 // matched the incorrect use of the functions.
1209 //
1210#if 0
1211 {
1212 UErrorCode status = U_ZERO_ERROR;
1213 UChar ucharString[20];
1214 RegexMatcher m(".", 0, status);
1215 m.reset(ucharString); // should not compile.
1216
1217 RegexPattern *p = RegexPattern::compile(".", 0, status);
1218 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1219
1220 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1221 }
1222#endif
1223
1224 //
1225 // Time Outs.
1226 // Note: These tests will need to be changed when the regexp engine is
1227 // able to detect and cut short the exponential time behavior on
1228 // this type of match.
1229 //
1230 {
1231 UErrorCode status = U_ZERO_ERROR;
1232 // Enough 'a's in the string to cause the match to time out.
1233 // (Each additional 'a' doubles the time)
1234 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1235 RegexMatcher matcher("(a+)+b", testString, 0, status);
1236 REGEX_CHECK_STATUS;
1237 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1238 matcher.setTimeLimit(100, status);
1239 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1240 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1241 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1242 }
1243 {
1244 UErrorCode status = U_ZERO_ERROR;
1245 // Few enough 'a's to slip in under the time limit.
1246 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1247 RegexMatcher matcher("(a+)+b", testString, 0, status);
1248 REGEX_CHECK_STATUS;
1249 matcher.setTimeLimit(100, status);
1250 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1251 REGEX_CHECK_STATUS;
1252 }
1253
1254 //
1255 // Stack Limits
1256 //
1257 {
1258 UErrorCode status = U_ZERO_ERROR;
1259 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1260
1261 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262 // of the '+', and makes the stack frames larger.
1263 RegexMatcher matcher("(A)+A$", testString, 0, status);
1264
1265 // With the default stack, this match should fail to run
1266 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1267 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1268
1269 // With unlimited stack, it should run
1270 status = U_ZERO_ERROR;
1271 matcher.setStackLimit(0, status);
1272 REGEX_CHECK_STATUS;
1273 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1274 REGEX_CHECK_STATUS;
1275 REGEX_ASSERT(matcher.getStackLimit() == 0);
1276
1277 // With a limited stack, the match should fail
1278 status = U_ZERO_ERROR;
1279 matcher.setStackLimit(10000, status);
1280 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1281 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1282 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1283 }
1284
1285 // A pattern that doesn't save state should work with
1286 // a minimal sized stack
1287 {
1288 UErrorCode status = U_ZERO_ERROR;
1289 UnicodeString testString = "abc";
1290 RegexMatcher matcher("abc", testString, 0, status);
1291 REGEX_CHECK_STATUS;
1292 matcher.setStackLimit(30, status);
1293 REGEX_CHECK_STATUS;
1294 REGEX_ASSERT(matcher.matches(status) == TRUE);
1295 REGEX_CHECK_STATUS;
1296 REGEX_ASSERT(matcher.getStackLimit() == 30);
1297
1298 // Negative stack sizes should fail
1299 status = U_ZERO_ERROR;
1300 matcher.setStackLimit(1000, status);
1301 REGEX_CHECK_STATUS;
1302 matcher.setStackLimit(-1, status);
1303 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1304 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1305 }
1306
1307
1308}
1309
1310
1311
1312
1313
1314
1315//---------------------------------------------------------------------------
1316//
1317// API_Replace API test for class RegexMatcher, testing the
1318// Replace family of functions.
1319//
1320//---------------------------------------------------------------------------
1321void RegexTest::API_Replace() {
1322 //
1323 // Replace
1324 //
1325 int32_t flags=0;
1326 UParseError pe;
1327 UErrorCode status=U_ZERO_ERROR;
1328
1329 UnicodeString re("abc");
1330 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1331 REGEX_CHECK_STATUS;
1332 UnicodeString data = ".abc..abc...abc..";
1333 // 012345678901234567
1334 RegexMatcher *matcher = pat->matcher(data, status);
1335
1336 //
1337 // Plain vanilla matches.
1338 //
1339 UnicodeString dest;
1340 dest = matcher->replaceFirst("yz", status);
1341 REGEX_CHECK_STATUS;
1342 REGEX_ASSERT(dest == ".yz..abc...abc..");
1343
1344 dest = matcher->replaceAll("yz", status);
1345 REGEX_CHECK_STATUS;
1346 REGEX_ASSERT(dest == ".yz..yz...yz..");
1347
1348 //
1349 // Plain vanilla non-matches.
1350 //
1351 UnicodeString d2 = ".abx..abx...abx..";
1352 matcher->reset(d2);
1353 dest = matcher->replaceFirst("yz", status);
1354 REGEX_CHECK_STATUS;
1355 REGEX_ASSERT(dest == ".abx..abx...abx..");
1356
1357 dest = matcher->replaceAll("yz", status);
1358 REGEX_CHECK_STATUS;
1359 REGEX_ASSERT(dest == ".abx..abx...abx..");
1360
1361 //
1362 // Empty source string
1363 //
1364 UnicodeString d3 = "";
1365 matcher->reset(d3);
1366 dest = matcher->replaceFirst("yz", status);
1367 REGEX_CHECK_STATUS;
1368 REGEX_ASSERT(dest == "");
1369
1370 dest = matcher->replaceAll("yz", status);
1371 REGEX_CHECK_STATUS;
1372 REGEX_ASSERT(dest == "");
1373
1374 //
1375 // Empty substitution string
1376 //
1377 matcher->reset(data); // ".abc..abc...abc.."
1378 dest = matcher->replaceFirst("", status);
1379 REGEX_CHECK_STATUS;
1380 REGEX_ASSERT(dest == "...abc...abc..");
1381
1382 dest = matcher->replaceAll("", status);
1383 REGEX_CHECK_STATUS;
1384 REGEX_ASSERT(dest == "........");
1385
1386 //
1387 // match whole string
1388 //
1389 UnicodeString d4 = "abc";
1390 matcher->reset(d4);
1391 dest = matcher->replaceFirst("xyz", status);
1392 REGEX_CHECK_STATUS;
1393 REGEX_ASSERT(dest == "xyz");
1394
1395 dest = matcher->replaceAll("xyz", status);
1396 REGEX_CHECK_STATUS;
1397 REGEX_ASSERT(dest == "xyz");
1398
1399 //
1400 // Capture Group, simple case
1401 //
1402 UnicodeString re2("a(..)");
1403 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1404 REGEX_CHECK_STATUS;
1405 UnicodeString d5 = "abcdefg";
1406 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1407 REGEX_CHECK_STATUS;
1408 dest = matcher2->replaceFirst("$1$1", status);
1409 REGEX_CHECK_STATUS;
1410 REGEX_ASSERT(dest == "bcbcdefg");
1411
1412 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1413 REGEX_CHECK_STATUS;
1414 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1415
1416 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1417 REGEX_ASSERT(U_FAILURE(status));
1418 status = U_ZERO_ERROR;
1419
1420 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1421 replacement = replacement.unescape();
1422 dest = matcher2->replaceFirst(replacement, status);
1423 REGEX_CHECK_STATUS;
1424 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1425
1426 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1427
1428
1429 //
1430 // Replacement String with \u hex escapes
1431 //
1432 {
1433 UnicodeString src = "abc 1 abc 2 abc 3";
1434 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1435 matcher->reset(src);
1436 UnicodeString result = matcher->replaceAll(substitute, status);
1437 REGEX_CHECK_STATUS;
1438 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1439 }
1440 {
1441 UnicodeString src = "abc !";
1442 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1443 matcher->reset(src);
1444 UnicodeString result = matcher->replaceAll(substitute, status);
1445 REGEX_CHECK_STATUS;
1446 UnicodeString expected = UnicodeString("--");
1447 expected.append((UChar32)0x10000);
1448 expected.append("-- !");
1449 REGEX_ASSERT(result == expected);
1450 }
1451 // TODO: need more through testing of capture substitutions.
1452
1453 // Bug 4057
1454 //
1455 {
1456 status = U_ZERO_ERROR;
1457 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1458 RegexMatcher m("ss(.*?)ee", 0, status);
1459 REGEX_CHECK_STATUS;
1460 UnicodeString result;
1461
1462 // Multiple finds do NOT bump up the previous appendReplacement postion.
1463 m.reset(s);
1464 m.find();
1465 m.find();
1466 m.appendReplacement(result, "ooh", status);
1467 REGEX_CHECK_STATUS;
1468 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1469
1470 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1471 status = U_ZERO_ERROR;
1472 result.truncate(0);
1473 m.reset(10, status);
1474 m.find();
1475 m.find();
1476 m.appendReplacement(result, "ooh", status);
1477 REGEX_CHECK_STATUS;
1478 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1479
1480 // find() at interior of string, appendReplacemnt still starts at beginning.
1481 status = U_ZERO_ERROR;
1482 result.truncate(0);
1483 m.reset();
1484 m.find(10, status);
1485 m.find();
1486 m.appendReplacement(result, "ooh", status);
1487 REGEX_CHECK_STATUS;
1488 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1489
1490 m.appendTail(result);
1491 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1492
1493 }
1494
1495 delete matcher2;
1496 delete pat2;
1497 delete matcher;
1498 delete pat;
1499}
1500
1501
1502//---------------------------------------------------------------------------
1503//
1504// API_Pattern Test that the API for class RegexPattern is
1505// present and nominally working.
1506//
1507//---------------------------------------------------------------------------
1508void RegexTest::API_Pattern() {
1509 RegexPattern pata; // Test default constructor to not crash.
1510 RegexPattern patb;
1511
1512 REGEX_ASSERT(pata == patb);
1513 REGEX_ASSERT(pata == pata);
1514
1515 UnicodeString re1("abc[a-l][m-z]");
1516 UnicodeString re2("def");
1517 UErrorCode status = U_ZERO_ERROR;
1518 UParseError pe;
1519
1520 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1521 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1522 REGEX_CHECK_STATUS;
1523 REGEX_ASSERT(*pat1 == *pat1);
1524 REGEX_ASSERT(*pat1 != pata);
1525
1526 // Assign
1527 patb = *pat1;
1528 REGEX_ASSERT(patb == *pat1);
1529
1530 // Copy Construct
1531 RegexPattern patc(*pat1);
1532 REGEX_ASSERT(patc == *pat1);
1533 REGEX_ASSERT(patb == patc);
1534 REGEX_ASSERT(pat1 != pat2);
1535 patb = *pat2;
1536 REGEX_ASSERT(patb != patc);
1537 REGEX_ASSERT(patb == *pat2);
1538
1539 // Compile with no flags.
1540 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1541 REGEX_ASSERT(*pat1a == *pat1);
1542
1543 REGEX_ASSERT(pat1a->flags() == 0);
1544
1545 // Compile with different flags should be not equal
1546 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1547 REGEX_CHECK_STATUS;
1548
1549 REGEX_ASSERT(*pat1b != *pat1a);
1550 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1551 REGEX_ASSERT(pat1a->flags() == 0);
1552 delete pat1b;
1553
1554 // clone
1555 RegexPattern *pat1c = pat1->clone();
1556 REGEX_ASSERT(*pat1c == *pat1);
1557 REGEX_ASSERT(*pat1c != *pat2);
1558
1559 delete pat1c;
1560 delete pat1a;
1561 delete pat1;
1562 delete pat2;
1563
1564
1565 //
1566 // Verify that a matcher created from a cloned pattern works.
1567 // (Jitterbug 3423)
1568 //
1569 {
1570 UErrorCode status = U_ZERO_ERROR;
1571 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1572 RegexPattern *pClone = pSource->clone();
1573 delete pSource;
1574 RegexMatcher *mFromClone = pClone->matcher(status);
1575 REGEX_CHECK_STATUS;
1576 UnicodeString s = "Hello World";
1577 mFromClone->reset(s);
1578 REGEX_ASSERT(mFromClone->find() == TRUE);
1579 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1580 REGEX_ASSERT(mFromClone->find() == TRUE);
1581 REGEX_ASSERT(mFromClone->group(status) == "World");
1582 REGEX_ASSERT(mFromClone->find() == FALSE);
1583 delete mFromClone;
1584 delete pClone;
1585 }
1586
1587 //
1588 // matches convenience API
1589 //
1590 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1591 REGEX_CHECK_STATUS;
1592 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1593 REGEX_CHECK_STATUS;
1594 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1595 REGEX_CHECK_STATUS;
1596 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1597 REGEX_CHECK_STATUS;
1598 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1599 REGEX_CHECK_STATUS;
1600 status = U_INDEX_OUTOFBOUNDS_ERROR;
1601 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1602 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1603
1604
1605 //
1606 // Split()
1607 //
1608 status = U_ZERO_ERROR;
1609 pat1 = RegexPattern::compile(" +", pe, status);
1610 REGEX_CHECK_STATUS;
1611 UnicodeString fields[10];
1612
1613 int32_t n;
1614 n = pat1->split("Now is the time", fields, 10, status);
1615 REGEX_CHECK_STATUS;
1616 REGEX_ASSERT(n==4);
1617 REGEX_ASSERT(fields[0]=="Now");
1618 REGEX_ASSERT(fields[1]=="is");
1619 REGEX_ASSERT(fields[2]=="the");
1620 REGEX_ASSERT(fields[3]=="time");
1621 REGEX_ASSERT(fields[4]=="");
1622
1623 n = pat1->split("Now is the time", fields, 2, status);
1624 REGEX_CHECK_STATUS;
1625 REGEX_ASSERT(n==2);
1626 REGEX_ASSERT(fields[0]=="Now");
1627 REGEX_ASSERT(fields[1]=="is the time");
1628 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1629
1630 fields[1] = "*";
1631 status = U_ZERO_ERROR;
1632 n = pat1->split("Now is the time", fields, 1, status);
1633 REGEX_CHECK_STATUS;
1634 REGEX_ASSERT(n==1);
1635 REGEX_ASSERT(fields[0]=="Now is the time");
1636 REGEX_ASSERT(fields[1]=="*");
1637 status = U_ZERO_ERROR;
1638
1639 n = pat1->split(" Now is the time ", fields, 10, status);
1640 REGEX_CHECK_STATUS;
1641 REGEX_ASSERT(n==6);
1642 REGEX_ASSERT(fields[0]=="");
1643 REGEX_ASSERT(fields[1]=="Now");
1644 REGEX_ASSERT(fields[2]=="is");
1645 REGEX_ASSERT(fields[3]=="the");
1646 REGEX_ASSERT(fields[4]=="time");
1647 REGEX_ASSERT(fields[5]=="");
1648
1649 n = pat1->split(" ", fields, 10, status);
1650 REGEX_CHECK_STATUS;
1651 REGEX_ASSERT(n==2);
1652 REGEX_ASSERT(fields[0]=="");
1653 REGEX_ASSERT(fields[1]=="");
1654
1655 fields[0] = "foo";
1656 n = pat1->split("", fields, 10, status);
1657 REGEX_CHECK_STATUS;
1658 REGEX_ASSERT(n==0);
1659 REGEX_ASSERT(fields[0]=="foo");
1660
1661 delete pat1;
1662
1663 // split, with a pattern with (capture)
1664 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1665 REGEX_CHECK_STATUS;
1666
1667 status = U_ZERO_ERROR;
1668 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1669 REGEX_CHECK_STATUS;
1670 REGEX_ASSERT(n==7);
1671 REGEX_ASSERT(fields[0]=="");
1672 REGEX_ASSERT(fields[1]=="a");
1673 REGEX_ASSERT(fields[2]=="Now is ");
1674 REGEX_ASSERT(fields[3]=="b");
1675 REGEX_ASSERT(fields[4]=="the time");
1676 REGEX_ASSERT(fields[5]=="c");
1677 REGEX_ASSERT(fields[6]=="");
1678 REGEX_ASSERT(status==U_ZERO_ERROR);
1679
1680 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1681 REGEX_CHECK_STATUS;
1682 REGEX_ASSERT(n==7);
1683 REGEX_ASSERT(fields[0]==" ");
1684 REGEX_ASSERT(fields[1]=="a");
1685 REGEX_ASSERT(fields[2]=="Now is ");
1686 REGEX_ASSERT(fields[3]=="b");
1687 REGEX_ASSERT(fields[4]=="the time");
1688 REGEX_ASSERT(fields[5]=="c");
1689 REGEX_ASSERT(fields[6]=="");
1690
1691 status = U_ZERO_ERROR;
1692 fields[6] = "foo";
1693 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1694 REGEX_CHECK_STATUS;
1695 REGEX_ASSERT(n==6);
1696 REGEX_ASSERT(fields[0]==" ");
1697 REGEX_ASSERT(fields[1]=="a");
1698 REGEX_ASSERT(fields[2]=="Now is ");
1699 REGEX_ASSERT(fields[3]=="b");
1700 REGEX_ASSERT(fields[4]=="the time");
1701 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1702 REGEX_ASSERT(fields[6]=="foo");
1703
1704 status = U_ZERO_ERROR;
1705 fields[5] = "foo";
1706 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1707 REGEX_CHECK_STATUS;
1708 REGEX_ASSERT(n==5);
1709 REGEX_ASSERT(fields[0]==" ");
1710 REGEX_ASSERT(fields[1]=="a");
1711 REGEX_ASSERT(fields[2]=="Now is ");
1712 REGEX_ASSERT(fields[3]=="b");
1713 REGEX_ASSERT(fields[4]=="the time<c>");
1714 REGEX_ASSERT(fields[5]=="foo");
1715
1716 status = U_ZERO_ERROR;
1717 fields[5] = "foo";
1718 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1719 REGEX_CHECK_STATUS;
1720 REGEX_ASSERT(n==5);
1721 REGEX_ASSERT(fields[0]==" ");
1722 REGEX_ASSERT(fields[1]=="a");
1723 REGEX_ASSERT(fields[2]=="Now is ");
1724 REGEX_ASSERT(fields[3]=="b");
1725 REGEX_ASSERT(fields[4]=="the time");
1726 REGEX_ASSERT(fields[5]=="foo");
1727
1728 status = U_ZERO_ERROR;
1729 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1730 REGEX_CHECK_STATUS;
1731 REGEX_ASSERT(n==4);
1732 REGEX_ASSERT(fields[0]==" ");
1733 REGEX_ASSERT(fields[1]=="a");
1734 REGEX_ASSERT(fields[2]=="Now is ");
1735 REGEX_ASSERT(fields[3]=="the time<c>");
1736 status = U_ZERO_ERROR;
1737 delete pat1;
1738
1739 pat1 = RegexPattern::compile("([-,])", pe, status);
1740 REGEX_CHECK_STATUS;
1741 n = pat1->split("1-10,20", fields, 10, status);
1742 REGEX_CHECK_STATUS;
1743 REGEX_ASSERT(n==5);
1744 REGEX_ASSERT(fields[0]=="1");
1745 REGEX_ASSERT(fields[1]=="-");
1746 REGEX_ASSERT(fields[2]=="10");
1747 REGEX_ASSERT(fields[3]==",");
1748 REGEX_ASSERT(fields[4]=="20");
1749 delete pat1;
1750
1751 // Test split of string with empty trailing fields
1752 pat1 = RegexPattern::compile(",", pe, status);
1753 REGEX_CHECK_STATUS;
1754 n = pat1->split("a,b,c,", fields, 10, status);
1755 REGEX_CHECK_STATUS;
1756 REGEX_ASSERT(n==4);
1757 REGEX_ASSERT(fields[0]=="a");
1758 REGEX_ASSERT(fields[1]=="b");
1759 REGEX_ASSERT(fields[2]=="c");
1760 REGEX_ASSERT(fields[3]=="");
1761
1762 n = pat1->split("a,,,", fields, 10, status);
1763 REGEX_CHECK_STATUS;
1764 REGEX_ASSERT(n==4);
1765 REGEX_ASSERT(fields[0]=="a");
1766 REGEX_ASSERT(fields[1]=="");
1767 REGEX_ASSERT(fields[2]=="");
1768 REGEX_ASSERT(fields[3]=="");
1769 delete pat1;
1770
1771 // Split Separator with zero length match.
1772 pat1 = RegexPattern::compile(":?", pe, status);
1773 REGEX_CHECK_STATUS;
1774 n = pat1->split("abc", fields, 10, status);
1775 REGEX_CHECK_STATUS;
1776 REGEX_ASSERT(n==5);
1777 REGEX_ASSERT(fields[0]=="");
1778 REGEX_ASSERT(fields[1]=="a");
1779 REGEX_ASSERT(fields[2]=="b");
1780 REGEX_ASSERT(fields[3]=="c");
1781 REGEX_ASSERT(fields[4]=="");
1782
1783 delete pat1;
1784
1785 //
1786 // RegexPattern::pattern()
1787 //
1788 pat1 = new RegexPattern();
1789 REGEX_ASSERT(pat1->pattern() == "");
1790 delete pat1;
1791
1792 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1793 REGEX_CHECK_STATUS;
1794 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1795 delete pat1;
1796
1797
1798 //
1799 // classID functions
1800 //
1801 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1802 REGEX_CHECK_STATUS;
1803 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1804 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1805 UnicodeString Hello("Hello, world.");
1806 RegexMatcher *m = pat1->matcher(Hello, status);
1807 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1808 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1810 delete m;
1811 delete pat1;
1812
1813}
1814
1815//---------------------------------------------------------------------------
1816//
1817// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1818// is present and working, but excluding functions
1819// implementing replace operations.
1820//
1821//---------------------------------------------------------------------------
1822void RegexTest::API_Match_UTF8() {
1823 UParseError pe;
1824 UErrorCode status=U_ZERO_ERROR;
1825 int32_t flags = 0;
1826
1827 //
1828 // Debug - slide failing test cases early
1829 //
1830#if 0
1831 {
1832 }
1833 return;
1834#endif
1835
1836 //
1837 // Simple pattern compilation
1838 //
1839 {
1840 UText re = UTEXT_INITIALIZER;
1841 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1842 REGEX_VERBOSE_TEXT(&re);
1843 RegexPattern *pat2;
1844 pat2 = RegexPattern::compile(&re, flags, pe, status);
1845 REGEX_CHECK_STATUS;
1846
1847 UText input1 = UTEXT_INITIALIZER;
1848 UText input2 = UTEXT_INITIALIZER;
1849 UText empty = UTEXT_INITIALIZER;
1850 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1851 REGEX_VERBOSE_TEXT(&input1);
1852 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1853 REGEX_VERBOSE_TEXT(&input2);
1854 utext_openUChars(&empty, NULL, 0, &status);
1855
1856 int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1857 int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1858
1859
1860 //
1861 // Matcher creation and reset.
1862 //
1863 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1864 REGEX_CHECK_STATUS;
1865 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1866 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1868 m1->reset(&input2);
1869 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1870 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1872 m1->reset(&input1);
1873 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1874 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1875 m1->reset(&empty);
1876 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1877 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1878
1879 //
1880 // reset(pos, status)
1881 //
1882 m1->reset(&input1);
1883 m1->reset(4, status);
1884 REGEX_CHECK_STATUS;
1885 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1886 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1887
1888 m1->reset(-1, status);
1889 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1890 status = U_ZERO_ERROR;
1891
1892 m1->reset(0, status);
1893 REGEX_CHECK_STATUS;
1894 status = U_ZERO_ERROR;
1895
1896 m1->reset(input1Len-1, status);
1897 REGEX_CHECK_STATUS;
1898 status = U_ZERO_ERROR;
1899
1900 m1->reset(input1Len, status);
1901 REGEX_CHECK_STATUS;
1902 status = U_ZERO_ERROR;
1903
1904 m1->reset(input1Len+1, status);
1905 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1906 status = U_ZERO_ERROR;
1907
1908 //
1909 // match(pos, status)
1910 //
1911 m1->reset(&input2);
1912 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913 m1->reset();
1914 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1915 m1->reset();
1916 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1917 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1918 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920
1921 // Match() at end of string should fail, but should not
1922 // be an error.
1923 status = U_ZERO_ERROR;
1924 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1925 REGEX_CHECK_STATUS;
1926
1927 // Match beyond end of string should fail with an error.
1928 status = U_ZERO_ERROR;
1929 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1930 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931
1932 // Successful match at end of string.
1933 {
1934 status = U_ZERO_ERROR;
1935 RegexMatcher m("A?", 0, status); // will match zero length string.
1936 REGEX_CHECK_STATUS;
1937 m.reset(&input1);
1938 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1939 REGEX_CHECK_STATUS;
1940 m.reset(&empty);
1941 REGEX_ASSERT(m.matches(0, status) == TRUE);
1942 REGEX_CHECK_STATUS;
1943 }
1944
1945
1946 //
1947 // lookingAt(pos, status)
1948 //
1949 status = U_ZERO_ERROR;
1950 m1->reset(&input2); // "not abc"
1951 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1952 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1953 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1954 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1955 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1956 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957 status = U_ZERO_ERROR;
1958 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1959 REGEX_CHECK_STATUS;
1960 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1961 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1962
1963 delete m1;
1964 delete pat2;
1965
1966 utext_close(&re);
1967 utext_close(&input1);
1968 utext_close(&input2);
1969 utext_close(&empty);
1970 }
1971
1972
1973 //
1974 // Capture Group.
1975 // RegexMatcher::start();
1976 // RegexMatcher::end();
1977 // RegexMatcher::groupCount();
1978 //
1979 {
1980 int32_t flags=0;
1981 UParseError pe;
1982 UErrorCode status=U_ZERO_ERROR;
1983 UText re=UTEXT_INITIALIZER;
1984 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1986
1987 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1988 REGEX_CHECK_STATUS;
1989
1990 UText input = UTEXT_INITIALIZER;
1991 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992 utext_openUTF8(&input, str_0123456789, -1, &status);
1993
1994 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1995 REGEX_CHECK_STATUS;
1996 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1997 static const int32_t matchStarts[] = {0, 2, 4, 8};
1998 static const int32_t matchEnds[] = {10, 8, 6, 10};
1999 int32_t i;
2000 for (i=0; i<4; i++) {
2001 int32_t actualStart = matcher->start(i, status);
2002 REGEX_CHECK_STATUS;
2003 if (actualStart != matchStarts[i]) {
2004 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2005 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2006 }
2007 int32_t actualEnd = matcher->end(i, status);
2008 REGEX_CHECK_STATUS;
2009 if (actualEnd != matchEnds[i]) {
2010 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2011 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2012 }
2013 }
2014
2015 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2016 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2017
2018 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2019 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020 matcher->reset();
2021 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2022
2023 matcher->lookingAt(status);
2024
2025 UnicodeString dest;
2026 UText destText = UTEXT_INITIALIZER;
2027 utext_openUnicodeString(&destText, &dest, &status);
2028 UText *result;
2029 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030 // Test shallow-clone API
2031 int64_t group_len;
2032 result = matcher->group((UText *)NULL, group_len, status);
2033 REGEX_CHECK_STATUS;
2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035 utext_close(result);
2036 result = matcher->group(0, &destText, group_len, status);
2037 REGEX_CHECK_STATUS;
2038 REGEX_ASSERT(result == &destText);
2039 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2040 // destText is now immutable, reopen it
2041 utext_close(&destText);
2042 utext_openUnicodeString(&destText, &dest, &status);
2043
2044 int64_t length;
2045 result = matcher->group(0, NULL, length, status);
2046 REGEX_CHECK_STATUS;
2047 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2048 utext_close(result);
2049 result = matcher->group(0, &destText, length, status);
2050 REGEX_CHECK_STATUS;
2051 REGEX_ASSERT(result == &destText);
2052 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2053 REGEX_ASSERT(length == 10);
2054 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2055
2056 // Capture Group 1 == "234567"
2057 result = matcher->group(1, NULL, length, status);
2058 REGEX_CHECK_STATUS;
2059 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2060 REGEX_ASSERT(length == 6);
2061 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2062 utext_close(result);
2063
2064 result = matcher->group(1, &destText, length, status);
2065 REGEX_CHECK_STATUS;
2066 REGEX_ASSERT(result == &destText);
2067 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2068 REGEX_ASSERT(length == 6);
2069 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2070 utext_close(result);
2071
2072 // Capture Group 2 == "45"
2073 result = matcher->group(2, NULL, length, status);
2074 REGEX_CHECK_STATUS;
2075 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2076 REGEX_ASSERT(length == 2);
2077 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2078 utext_close(result);
2079
2080 result = matcher->group(2, &destText, length, status);
2081 REGEX_CHECK_STATUS;
2082 REGEX_ASSERT(result == &destText);
2083 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2084 REGEX_ASSERT(length == 2);
2085 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2086 utext_close(result);
2087
2088 // Capture Group 3 == "89"
2089 result = matcher->group(3, NULL, length, status);
2090 REGEX_CHECK_STATUS;
2091 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2092 REGEX_ASSERT(length == 2);
2093 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2094 utext_close(result);
2095
2096 result = matcher->group(3, &destText, length, status);
2097 REGEX_CHECK_STATUS;
2098 REGEX_ASSERT(result == &destText);
2099 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2100 REGEX_ASSERT(length == 2);
2101 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2102 utext_close(result);
2103
2104 // Capture Group number out of range.
2105 status = U_ZERO_ERROR;
2106 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2107 status = U_ZERO_ERROR;
2108 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2109 status = U_ZERO_ERROR;
2110 matcher->reset();
2111 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2112
2113 delete matcher;
2114 delete pat;
2115
2116 utext_close(&destText);
2117 utext_close(&input);
2118 utext_close(&re);
2119 }
2120
2121 //
2122 // find
2123 //
2124 {
2125 int32_t flags=0;
2126 UParseError pe;
2127 UErrorCode status=U_ZERO_ERROR;
2128 UText re=UTEXT_INITIALIZER;
2129 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2130 utext_openUTF8(&re, str_abc, -1, &status);
2131
2132 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2133 REGEX_CHECK_STATUS;
2134 UText input = UTEXT_INITIALIZER;
2135 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2136 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2137 // 012345678901234567
2138
2139 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2140 REGEX_CHECK_STATUS;
2141 REGEX_ASSERT(matcher->find());
2142 REGEX_ASSERT(matcher->start(status) == 1);
2143 REGEX_ASSERT(matcher->find());
2144 REGEX_ASSERT(matcher->start(status) == 6);
2145 REGEX_ASSERT(matcher->find());
2146 REGEX_ASSERT(matcher->start(status) == 12);
2147 REGEX_ASSERT(matcher->find() == FALSE);
2148 REGEX_ASSERT(matcher->find() == FALSE);
2149
2150 matcher->reset();
2151 REGEX_ASSERT(matcher->find());
2152 REGEX_ASSERT(matcher->start(status) == 1);
2153
2154 REGEX_ASSERT(matcher->find(0, status));
2155 REGEX_ASSERT(matcher->start(status) == 1);
2156 REGEX_ASSERT(matcher->find(1, status));
2157 REGEX_ASSERT(matcher->start(status) == 1);
2158 REGEX_ASSERT(matcher->find(2, status));
2159 REGEX_ASSERT(matcher->start(status) == 6);
2160 REGEX_ASSERT(matcher->find(12, status));
2161 REGEX_ASSERT(matcher->start(status) == 12);
2162 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2163 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2164 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2165 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2166
2167 status = U_ZERO_ERROR;
2168 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2169 status = U_ZERO_ERROR;
2170 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2171
2172 REGEX_ASSERT(matcher->groupCount() == 0);
2173
2174 delete matcher;
2175 delete pat;
2176
2177 utext_close(&input);
2178 utext_close(&re);
2179 }
2180
2181
2182 //
2183 // find, with \G in pattern (true if at the end of a previous match).
2184 //
2185 {
2186 int32_t flags=0;
2187 UParseError pe;
2188 UErrorCode status=U_ZERO_ERROR;
2189 UText re=UTEXT_INITIALIZER;
2190 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2191 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2192
2193 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2194
2195 REGEX_CHECK_STATUS;
2196 UText input = UTEXT_INITIALIZER;
2197 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2198 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2199 // 012345678901234567
2200
2201 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2202 REGEX_CHECK_STATUS;
2203 REGEX_ASSERT(matcher->find());
2204 REGEX_ASSERT(matcher->start(status) == 0);
2205 REGEX_ASSERT(matcher->start(1, status) == -1);
2206 REGEX_ASSERT(matcher->start(2, status) == 1);
2207
2208 REGEX_ASSERT(matcher->find());
2209 REGEX_ASSERT(matcher->start(status) == 4);
2210 REGEX_ASSERT(matcher->start(1, status) == 4);
2211 REGEX_ASSERT(matcher->start(2, status) == -1);
2212 REGEX_CHECK_STATUS;
2213
2214 delete matcher;
2215 delete pat;
2216
2217 utext_close(&input);
2218 utext_close(&re);
2219 }
2220
2221 //
2222 // find with zero length matches, match position should bump ahead
2223 // to prevent loops.
2224 //
2225 {
2226 int32_t i;
2227 UErrorCode status=U_ZERO_ERROR;
2228 RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere,
2229 // using an always-true look-ahead.
2230 REGEX_CHECK_STATUS;
2231 UText s = UTEXT_INITIALIZER;
2232 utext_openUTF8(&s, " ", -1, &status);
2233 m.reset(&s);
2234 for (i=0; ; i++) {
2235 if (m.find() == FALSE) {
2236 break;
2237 }
2238 REGEX_ASSERT(m.start(status) == i);
2239 REGEX_ASSERT(m.end(status) == i);
2240 }
2241 REGEX_ASSERT(i==5);
2242
2243 // Check that the bump goes over characters outside the BMP OK
2244 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2245 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2246 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2247 m.reset(&s);
2248 for (i=0; ; i+=4) {
2249 if (m.find() == FALSE) {
2250 break;
2251 }
2252 REGEX_ASSERT(m.start(status) == i);
2253 REGEX_ASSERT(m.end(status) == i);
2254 }
2255 REGEX_ASSERT(i==20);
2256
2257 utext_close(&s);
2258 }
2259 {
2260 // find() loop breaking test.
2261 // with pattern of /.?/, should see a series of one char matches, then a single
2262 // match of zero length at the end of the input string.
2263 int32_t i;
2264 UErrorCode status=U_ZERO_ERROR;
2265 RegexMatcher m(".?", 0, status);
2266 REGEX_CHECK_STATUS;
2267 UText s = UTEXT_INITIALIZER;
2268 utext_openUTF8(&s, " ", -1, &status);
2269 m.reset(&s);
2270 for (i=0; ; i++) {
2271 if (m.find() == FALSE) {
2272 break;
2273 }
2274 REGEX_ASSERT(m.start(status) == i);
2275 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2276 }
2277 REGEX_ASSERT(i==5);
2278
2279 utext_close(&s);
2280 }
2281
2282
2283 //
2284 // Matchers with no input string behave as if they had an empty input string.
2285 //
2286
2287 {
2288 UErrorCode status = U_ZERO_ERROR;
2289 RegexMatcher m(".?", 0, status);
2290 REGEX_CHECK_STATUS;
2291 REGEX_ASSERT(m.find());
2292 REGEX_ASSERT(m.start(status) == 0);
2293 REGEX_ASSERT(m.input() == "");
2294 }
2295 {
2296 UErrorCode status = U_ZERO_ERROR;
2297 RegexPattern *p = RegexPattern::compile(".", 0, status);
2298 RegexMatcher *m = p->matcher(status);
2299 REGEX_CHECK_STATUS;
2300
2301 REGEX_ASSERT(m->find() == FALSE);
2302 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2303 delete m;
2304 delete p;
2305 }
2306
2307 //
2308 // Regions
2309 //
2310 {
2311 UErrorCode status = U_ZERO_ERROR;
2312 UText testPattern = UTEXT_INITIALIZER;
2313 UText testText = UTEXT_INITIALIZER;
2314 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2315 REGEX_VERBOSE_TEXT(&testPattern);
2316 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2317 REGEX_VERBOSE_TEXT(&testText);
2318
2319 RegexMatcher m(&testPattern, &testText, 0, status);
2320 REGEX_CHECK_STATUS;
2321 REGEX_ASSERT(m.regionStart() == 0);
2322 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2323 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2324 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2325
2326 m.region(2,4, status);
2327 REGEX_CHECK_STATUS;
2328 REGEX_ASSERT(m.matches(status));
2329 REGEX_ASSERT(m.start(status)==2);
2330 REGEX_ASSERT(m.end(status)==4);
2331 REGEX_CHECK_STATUS;
2332
2333 m.reset();
2334 REGEX_ASSERT(m.regionStart() == 0);
2335 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2336
2337 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2338 REGEX_VERBOSE_TEXT(&testText);
2339 m.reset(&testText);
2340 REGEX_ASSERT(m.regionStart() == 0);
2341 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2342
2343 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2344 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2345 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2346 REGEX_ASSERT(&m == &m.reset());
2347 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2348
2349 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2350 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2351 REGEX_ASSERT(&m == &m.reset());
2352 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2353
2354 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2355 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2356 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2357 REGEX_ASSERT(&m == &m.reset());
2358 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2359
2360 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2361 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2362 REGEX_ASSERT(&m == &m.reset());
2363 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2364
2365 utext_close(&testText);
2366 utext_close(&testPattern);
2367 }
2368
2369 //
2370 // hitEnd() and requireEnd()
2371 //
2372 {
2373 UErrorCode status = U_ZERO_ERROR;
2374 UText testPattern = UTEXT_INITIALIZER;
2375 UText testText = UTEXT_INITIALIZER;
2376 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2377 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2378 utext_openUTF8(&testPattern, str_, -1, &status);
2379 utext_openUTF8(&testText, str_aabb, -1, &status);
2380
2381 RegexMatcher m1(&testPattern, &testText, 0, status);
2382 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2383 REGEX_ASSERT(m1.hitEnd() == TRUE);
2384 REGEX_ASSERT(m1.requireEnd() == FALSE);
2385 REGEX_CHECK_STATUS;
2386
2387 status = U_ZERO_ERROR;
2388 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2389 utext_openUTF8(&testPattern, str_a, -1, &status);
2390 RegexMatcher m2(&testPattern, &testText, 0, status);
2391 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2392 REGEX_ASSERT(m2.hitEnd() == FALSE);
2393 REGEX_ASSERT(m2.requireEnd() == FALSE);
2394 REGEX_CHECK_STATUS;
2395
2396 status = U_ZERO_ERROR;
2397 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2398 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2399 RegexMatcher m3(&testPattern, &testText, 0, status);
2400 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2401 REGEX_ASSERT(m3.hitEnd() == TRUE);
2402 REGEX_ASSERT(m3.requireEnd() == TRUE);
2403 REGEX_CHECK_STATUS;
2404
2405 utext_close(&testText);
2406 utext_close(&testPattern);
2407 }
2408}
2409
2410
2411//---------------------------------------------------------------------------
2412//
2413// API_Replace_UTF8 API test for class RegexMatcher, testing the
2414// Replace family of functions.
2415//
2416//---------------------------------------------------------------------------
2417void RegexTest::API_Replace_UTF8() {
2418 //
2419 // Replace
2420 //
2421 int32_t flags=0;
2422 UParseError pe;
2423 UErrorCode status=U_ZERO_ERROR;
2424
2425 UText re=UTEXT_INITIALIZER;
2426 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2427 REGEX_VERBOSE_TEXT(&re);
2428 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2429 REGEX_CHECK_STATUS;
2430
2431 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2432 // 012345678901234567
2433 UText dataText = UTEXT_INITIALIZER;
2434 utext_openUTF8(&dataText, data, -1, &status);
2435 REGEX_CHECK_STATUS;
2436 REGEX_VERBOSE_TEXT(&dataText);
2437 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2438
2439 //
2440 // Plain vanilla matches.
2441 //
2442 UnicodeString dest;
2443 UText destText = UTEXT_INITIALIZER;
2444 utext_openUnicodeString(&destText, &dest, &status);
2445 UText *result;
2446
2447 UText replText = UTEXT_INITIALIZER;
2448
2449 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2450 utext_openUTF8(&replText, str_yz, -1, &status);
2451 REGEX_VERBOSE_TEXT(&replText);
2452 result = matcher->replaceFirst(&replText, NULL, status);
2453 REGEX_CHECK_STATUS;
2454 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2455 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2456 utext_close(result);
2457 result = matcher->replaceFirst(&replText, &destText, status);
2458 REGEX_CHECK_STATUS;
2459 REGEX_ASSERT(result == &destText);
2460 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2461
2462 result = matcher->replaceAll(&replText, NULL, status);
2463 REGEX_CHECK_STATUS;
2464 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2465 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2466 utext_close(result);
2467
2468 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2469 result = matcher->replaceAll(&replText, &destText, status);
2470 REGEX_CHECK_STATUS;
2471 REGEX_ASSERT(result == &destText);
2472 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2473
2474 //
2475 // Plain vanilla non-matches.
2476 //
2477 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2478 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2479 matcher->reset(&dataText);
2480
2481 result = matcher->replaceFirst(&replText, NULL, status);
2482 REGEX_CHECK_STATUS;
2483 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2484 utext_close(result);
2485 result = matcher->replaceFirst(&replText, &destText, status);
2486 REGEX_CHECK_STATUS;
2487 REGEX_ASSERT(result == &destText);
2488 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2489
2490 result = matcher->replaceAll(&replText, NULL, status);
2491 REGEX_CHECK_STATUS;
2492 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2493 utext_close(result);
2494 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2495 result = matcher->replaceAll(&replText, &destText, status);
2496 REGEX_CHECK_STATUS;
2497 REGEX_ASSERT(result == &destText);
2498 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2499
2500 //
2501 // Empty source string
2502 //
2503 utext_openUTF8(&dataText, NULL, 0, &status);
2504 matcher->reset(&dataText);
2505
2506 result = matcher->replaceFirst(&replText, NULL, status);
2507 REGEX_CHECK_STATUS;
2508 REGEX_ASSERT_UTEXT_UTF8("", result);
2509 utext_close(result);
2510 result = matcher->replaceFirst(&replText, &destText, status);
2511 REGEX_CHECK_STATUS;
2512 REGEX_ASSERT(result == &destText);
2513 REGEX_ASSERT_UTEXT_UTF8("", result);
2514
2515 result = matcher->replaceAll(&replText, NULL, status);
2516 REGEX_CHECK_STATUS;
2517 REGEX_ASSERT_UTEXT_UTF8("", result);
2518 utext_close(result);
2519 result = matcher->replaceAll(&replText, &destText, status);
2520 REGEX_CHECK_STATUS;
2521 REGEX_ASSERT(result == &destText);
2522 REGEX_ASSERT_UTEXT_UTF8("", result);
2523
2524 //
2525 // Empty substitution string
2526 //
2527 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2528 matcher->reset(&dataText);
2529
2530 utext_openUTF8(&replText, NULL, 0, &status);
2531 result = matcher->replaceFirst(&replText, NULL, status);
2532 REGEX_CHECK_STATUS;
2533 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2534 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2535 utext_close(result);
2536 result = matcher->replaceFirst(&replText, &destText, status);
2537 REGEX_CHECK_STATUS;
2538 REGEX_ASSERT(result == &destText);
2539 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2540
2541 result = matcher->replaceAll(&replText, NULL, status);
2542 REGEX_CHECK_STATUS;
2543 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2544 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2545 utext_close(result);
2546 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2547 result = matcher->replaceAll(&replText, &destText, status);
2548 REGEX_CHECK_STATUS;
2549 REGEX_ASSERT(result == &destText);
2550 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2551
2552 //
2553 // match whole string
2554 //
2555 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2556 utext_openUTF8(&dataText, str_abc, -1, &status);
2557 matcher->reset(&dataText);
2558
2559 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2560 utext_openUTF8(&replText, str_xyz, -1, &status);
2561 result = matcher->replaceFirst(&replText, NULL, status);
2562 REGEX_CHECK_STATUS;
2563 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2564 utext_close(result);
2565 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2566 result = matcher->replaceFirst(&replText, &destText, status);
2567 REGEX_CHECK_STATUS;
2568 REGEX_ASSERT(result == &destText);
2569 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2570
2571 result = matcher->replaceAll(&replText, NULL, status);
2572 REGEX_CHECK_STATUS;
2573 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2574 utext_close(result);
2575 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2576 result = matcher->replaceAll(&replText, &destText, status);
2577 REGEX_CHECK_STATUS;
2578 REGEX_ASSERT(result == &destText);
2579 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2580
2581 //
2582 // Capture Group, simple case
2583 //
2584 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2585 utext_openUTF8(&re, str_add, -1, &status);
2586 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2587 REGEX_CHECK_STATUS;
2588
2589 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2590 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2591 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2592 REGEX_CHECK_STATUS;
2593
2594 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2595 utext_openUTF8(&replText, str_11, -1, &status);
2596 result = matcher2->replaceFirst(&replText, NULL, status);
2597 REGEX_CHECK_STATUS;
2598 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2599 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2600 utext_close(result);
2601 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2602 result = matcher2->replaceFirst(&replText, &destText, status);
2603 REGEX_CHECK_STATUS;
2604 REGEX_ASSERT(result == &destText);
2605 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2606
2607 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2608 utext_openUTF8(&replText, str_v, -1, &status);
2609 REGEX_VERBOSE_TEXT(&replText);
2610 result = matcher2->replaceFirst(&replText, NULL, status);
2611 REGEX_CHECK_STATUS;
2612 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2613 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2614 utext_close(result);
2615 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2616 result = matcher2->replaceFirst(&replText, &destText, status);
2617 REGEX_CHECK_STATUS;
2618 REGEX_ASSERT(result == &destText);
2619 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2620
2621 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2622 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2623 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2624 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2625 result = matcher2->replaceFirst(&replText, NULL, status);
2626 REGEX_CHECK_STATUS;
2627 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2628 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2629 utext_close(result);
2630 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2631 result = matcher2->replaceFirst(&replText, &destText, status);
2632 REGEX_CHECK_STATUS;
2633 REGEX_ASSERT(result == &destText);
2634 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2635
2636 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2637 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2638 // 012345678901234567890123456
2639 supplDigitChars[22] = 0xF0;
2640 supplDigitChars[23] = 0x9D;
2641 supplDigitChars[24] = 0x9F;
2642 supplDigitChars[25] = 0x8F;
2643 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2644
2645 result = matcher2->replaceFirst(&replText, NULL, status);
2646 REGEX_CHECK_STATUS;
2647 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2648 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2649 utext_close(result);
2650 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2651 result = matcher2->replaceFirst(&replText, &destText, status);
2652 REGEX_CHECK_STATUS;
2653 REGEX_ASSERT(result == &destText);
2654 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2655 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2656 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2657 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2658// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2659 utext_close(result);
2660 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2661 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2662 REGEX_ASSERT(result == &destText);
2663// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2664
2665 //
2666 // Replacement String with \u hex escapes
2667 //
2668 {
2669 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2670 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2671 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2672 utext_openUTF8(&replText, str_u0043, -1, &status);
2673 matcher->reset(&dataText);
2674
2675 result = matcher->replaceAll(&replText, NULL, status);
2676 REGEX_CHECK_STATUS;
2677 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2678 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2679 utext_close(result);
2680 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2681 result = matcher->replaceAll(&replText, &destText, status);
2682 REGEX_CHECK_STATUS;
2683 REGEX_ASSERT(result == &destText);
2684 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2685 }
2686 {
2687 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2688 utext_openUTF8(&dataText, str_abc, -1, &status);
2689 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2690 utext_openUTF8(&replText, str_U00010000, -1, &status);
2691 matcher->reset(&dataText);
2692
2693 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2694 // 0123456789
2695 expected[2] = 0xF0;
2696 expected[3] = 0x90;
2697 expected[4] = 0x80;
2698 expected[5] = 0x80;
2699
2700 result = matcher->replaceAll(&replText, NULL, status);
2701 REGEX_CHECK_STATUS;
2702 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2703 utext_close(result);
2704 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2705 result = matcher->replaceAll(&replText, &destText, status);
2706 REGEX_CHECK_STATUS;
2707 REGEX_ASSERT(result == &destText);
2708 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2709 }
2710 // TODO: need more thorough testing of capture substitutions.
2711
2712 // Bug 4057
2713 //
2714 {
2715 status = U_ZERO_ERROR;
2716const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2717const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2718const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2719 utext_openUTF8(&re, str_ssee, -1, &status);
2720 utext_openUTF8(&dataText, str_blah, -1, &status);
2721 utext_openUTF8(&replText, str_ooh, -1, &status);
2722
2723 RegexMatcher m(&re, 0, status);
2724 REGEX_CHECK_STATUS;
2725
2726 UnicodeString result;
2727 UText resultText = UTEXT_INITIALIZER;
2728 utext_openUnicodeString(&resultText, &result, &status);
2729
2730 // Multiple finds do NOT bump up the previous appendReplacement position.
2731 m.reset(&dataText);
2732 m.find();
2733 m.find();
2734 m.appendReplacement(&resultText, &replText, status);
2735 REGEX_CHECK_STATUS;
2736 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2737 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2738
2739 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2740 status = U_ZERO_ERROR;
2741 result.truncate(0);
2742 utext_openUnicodeString(&resultText, &result, &status);
2743 m.reset(10, status);
2744 m.find();
2745 m.find();
2746 m.appendReplacement(&resultText, &replText, status);
2747 REGEX_CHECK_STATUS;
2748 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2749 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2750
2751 // find() at interior of string, appendReplacement still starts at beginning.
2752 status = U_ZERO_ERROR;
2753 result.truncate(0);
2754 utext_openUnicodeString(&resultText, &result, &status);
2755 m.reset();
2756 m.find(10, status);
2757 m.find();
2758 m.appendReplacement(&resultText, &replText, status);
2759 REGEX_CHECK_STATUS;
2760 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2762
2763 m.appendTail(&resultText, status);
2764 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2765 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2766
2767 utext_close(&resultText);
2768 }
2769
2770 delete matcher2;
2771 delete pat2;
2772 delete matcher;
2773 delete pat;
2774
2775 utext_close(&dataText);
2776 utext_close(&replText);
2777 utext_close(&destText);
2778 utext_close(&re);
2779}
2780
2781
2782//---------------------------------------------------------------------------
2783//
2784// API_Pattern_UTF8 Test that the API for class RegexPattern is
2785// present and nominally working.
2786//
2787//---------------------------------------------------------------------------
2788void RegexTest::API_Pattern_UTF8() {
2789 RegexPattern pata; // Test default constructor to not crash.
2790 RegexPattern patb;
2791
2792 REGEX_ASSERT(pata == patb);
2793 REGEX_ASSERT(pata == pata);
2794
2795 UText re1 = UTEXT_INITIALIZER;
2796 UText re2 = UTEXT_INITIALIZER;
2797 UErrorCode status = U_ZERO_ERROR;
2798 UParseError pe;
2799
2800 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2801 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2802 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2803 utext_openUTF8(&re2, str_def, -1, &status);
2804
2805 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2806 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2807 REGEX_CHECK_STATUS;
2808 REGEX_ASSERT(*pat1 == *pat1);
2809 REGEX_ASSERT(*pat1 != pata);
2810
2811 // Assign
2812 patb = *pat1;
2813 REGEX_ASSERT(patb == *pat1);
2814
2815 // Copy Construct
2816 RegexPattern patc(*pat1);
2817 REGEX_ASSERT(patc == *pat1);
2818 REGEX_ASSERT(patb == patc);
2819 REGEX_ASSERT(pat1 != pat2);
2820 patb = *pat2;
2821 REGEX_ASSERT(patb != patc);
2822 REGEX_ASSERT(patb == *pat2);
2823
2824 // Compile with no flags.
2825 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2826 REGEX_ASSERT(*pat1a == *pat1);
2827
2828 REGEX_ASSERT(pat1a->flags() == 0);
2829
2830 // Compile with different flags should be not equal
2831 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2832 REGEX_CHECK_STATUS;
2833
2834 REGEX_ASSERT(*pat1b != *pat1a);
2835 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2836 REGEX_ASSERT(pat1a->flags() == 0);
2837 delete pat1b;
2838
2839 // clone
2840 RegexPattern *pat1c = pat1->clone();
2841 REGEX_ASSERT(*pat1c == *pat1);
2842 REGEX_ASSERT(*pat1c != *pat2);
2843
2844 delete pat1c;
2845 delete pat1a;
2846 delete pat1;
2847 delete pat2;
2848
2849 utext_close(&re1);
2850 utext_close(&re2);
2851
2852
2853 //
2854 // Verify that a matcher created from a cloned pattern works.
2855 // (Jitterbug 3423)
2856 //
2857 {
2858 UErrorCode status = U_ZERO_ERROR;
2859 UText pattern = UTEXT_INITIALIZER;
2860 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2861 utext_openUTF8(&pattern, str_pL, -1, &status);
2862
2863 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2864 RegexPattern *pClone = pSource->clone();
2865 delete pSource;
2866 RegexMatcher *mFromClone = pClone->matcher(status);
2867 REGEX_CHECK_STATUS;
2868
2869 UText input = UTEXT_INITIALIZER;
2870 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2871 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2872 mFromClone->reset(&input);
2873 REGEX_ASSERT(mFromClone->find() == TRUE);
2874 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2875 REGEX_ASSERT(mFromClone->find() == TRUE);
2876 REGEX_ASSERT(mFromClone->group(status) == "World");
2877 REGEX_ASSERT(mFromClone->find() == FALSE);
2878 delete mFromClone;
2879 delete pClone;
2880
2881 utext_close(&input);
2882 utext_close(&pattern);
2883 }
2884
2885 //
2886 // matches convenience API
2887 //
2888 {
2889 UErrorCode status = U_ZERO_ERROR;
2890 UText pattern = UTEXT_INITIALIZER;
2891 UText input = UTEXT_INITIALIZER;
2892
2893 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2894 utext_openUTF8(&input, str_randominput, -1, &status);
2895
2896 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2897 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2898 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2899 REGEX_CHECK_STATUS;
2900
2901 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2902 utext_openUTF8(&pattern, str_abc, -1, &status);
2903 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2904 REGEX_CHECK_STATUS;
2905
2906 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2907 utext_openUTF8(&pattern, str_nput, -1, &status);
2908 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2909 REGEX_CHECK_STATUS;
2910
2911 utext_openUTF8(&pattern, str_randominput, -1, &status);
2912 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2913 REGEX_CHECK_STATUS;
2914
2915 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2916 utext_openUTF8(&pattern, str_u, -1, &status);
2917 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2918 REGEX_CHECK_STATUS;
2919
2920 utext_openUTF8(&input, str_abc, -1, &status);
2921 utext_openUTF8(&pattern, str_abc, -1, &status);
2922 status = U_INDEX_OUTOFBOUNDS_ERROR;
2923 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2924 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2925
2926 utext_close(&input);
2927 utext_close(&pattern);
2928 }
2929
2930
2931 //
2932 // Split()
2933 //
2934 status = U_ZERO_ERROR;
2935 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2936 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2937 pat1 = RegexPattern::compile(&re1, pe, status);
2938 REGEX_CHECK_STATUS;
2939 UnicodeString fields[10];
2940
2941 int32_t n;
2942 n = pat1->split("Now is the time", fields, 10, status);
2943 REGEX_CHECK_STATUS;
2944 REGEX_ASSERT(n==4);
2945 REGEX_ASSERT(fields[0]=="Now");
2946 REGEX_ASSERT(fields[1]=="is");
2947 REGEX_ASSERT(fields[2]=="the");
2948 REGEX_ASSERT(fields[3]=="time");
2949 REGEX_ASSERT(fields[4]=="");
2950
2951 n = pat1->split("Now is the time", fields, 2, status);
2952 REGEX_CHECK_STATUS;
2953 REGEX_ASSERT(n==2);
2954 REGEX_ASSERT(fields[0]=="Now");
2955 REGEX_ASSERT(fields[1]=="is the time");
2956 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2957
2958 fields[1] = "*";
2959 status = U_ZERO_ERROR;
2960 n = pat1->split("Now is the time", fields, 1, status);
2961 REGEX_CHECK_STATUS;
2962 REGEX_ASSERT(n==1);
2963 REGEX_ASSERT(fields[0]=="Now is the time");
2964 REGEX_ASSERT(fields[1]=="*");
2965 status = U_ZERO_ERROR;
2966
2967 n = pat1->split(" Now is the time ", fields, 10, status);
2968 REGEX_CHECK_STATUS;
2969 REGEX_ASSERT(n==6);
2970 REGEX_ASSERT(fields[0]=="");
2971 REGEX_ASSERT(fields[1]=="Now");
2972 REGEX_ASSERT(fields[2]=="is");
2973 REGEX_ASSERT(fields[3]=="the");
2974 REGEX_ASSERT(fields[4]=="time");
2975 REGEX_ASSERT(fields[5]=="");
2976 REGEX_ASSERT(fields[6]=="");
2977
2978 fields[2] = "*";
2979 n = pat1->split(" ", fields, 10, status);
2980 REGEX_CHECK_STATUS;
2981 REGEX_ASSERT(n==2);
2982 REGEX_ASSERT(fields[0]=="");
2983 REGEX_ASSERT(fields[1]=="");
2984 REGEX_ASSERT(fields[2]=="*");
2985
2986 fields[0] = "foo";
2987 n = pat1->split("", fields, 10, status);
2988 REGEX_CHECK_STATUS;
2989 REGEX_ASSERT(n==0);
2990 REGEX_ASSERT(fields[0]=="foo");
2991
2992 delete pat1;
2993
2994 // split, with a pattern with (capture)
2995 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2996 pat1 = RegexPattern::compile(&re1, pe, status);
2997 REGEX_CHECK_STATUS;
2998
2999 status = U_ZERO_ERROR;
3000 fields[6] = fields[7] = "*";
3001 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3002 REGEX_CHECK_STATUS;
3003 REGEX_ASSERT(n==7);
3004 REGEX_ASSERT(fields[0]=="");
3005 REGEX_ASSERT(fields[1]=="a");
3006 REGEX_ASSERT(fields[2]=="Now is ");
3007 REGEX_ASSERT(fields[3]=="b");
3008 REGEX_ASSERT(fields[4]=="the time");
3009 REGEX_ASSERT(fields[5]=="c");
3010 REGEX_ASSERT(fields[6]=="");
3011 REGEX_ASSERT(fields[7]=="*");
3012 REGEX_ASSERT(status==U_ZERO_ERROR);
3013
3014 fields[6] = fields[7] = "*";
3015 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3016 REGEX_CHECK_STATUS;
3017 REGEX_ASSERT(n==7);
3018 REGEX_ASSERT(fields[0]==" ");
3019 REGEX_ASSERT(fields[1]=="a");
3020 REGEX_ASSERT(fields[2]=="Now is ");
3021 REGEX_ASSERT(fields[3]=="b");
3022 REGEX_ASSERT(fields[4]=="the time");
3023 REGEX_ASSERT(fields[5]=="c");
3024 REGEX_ASSERT(fields[6]=="");
3025 REGEX_ASSERT(fields[7]=="*");
3026
3027 status = U_ZERO_ERROR;
3028 fields[6] = "foo";
3029 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3030 REGEX_CHECK_STATUS;
3031 REGEX_ASSERT(n==6);
3032 REGEX_ASSERT(fields[0]==" ");
3033 REGEX_ASSERT(fields[1]=="a");
3034 REGEX_ASSERT(fields[2]=="Now is ");
3035 REGEX_ASSERT(fields[3]=="b");
3036 REGEX_ASSERT(fields[4]=="the time");
3037 REGEX_ASSERT(fields[5]==" ");
3038 REGEX_ASSERT(fields[6]=="foo");
3039
3040 status = U_ZERO_ERROR;
3041 fields[5] = "foo";
3042 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3043 REGEX_CHECK_STATUS;
3044 REGEX_ASSERT(n==5);
3045 REGEX_ASSERT(fields[0]==" ");
3046 REGEX_ASSERT(fields[1]=="a");
3047 REGEX_ASSERT(fields[2]=="Now is ");
3048 REGEX_ASSERT(fields[3]=="b");
3049 REGEX_ASSERT(fields[4]=="the time<c>");
3050 REGEX_ASSERT(fields[5]=="foo");
3051
3052 status = U_ZERO_ERROR;
3053 fields[5] = "foo";
3054 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3055 REGEX_CHECK_STATUS;
3056 REGEX_ASSERT(n==5);
3057 REGEX_ASSERT(fields[0]==" ");
3058 REGEX_ASSERT(fields[1]=="a");
3059 REGEX_ASSERT(fields[2]=="Now is ");
3060 REGEX_ASSERT(fields[3]=="b");
3061 REGEX_ASSERT(fields[4]=="the time");
3062 REGEX_ASSERT(fields[5]=="foo");
3063
3064 status = U_ZERO_ERROR;
3065 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3066 REGEX_CHECK_STATUS;
3067 REGEX_ASSERT(n==4);
3068 REGEX_ASSERT(fields[0]==" ");
3069 REGEX_ASSERT(fields[1]=="a");
3070 REGEX_ASSERT(fields[2]=="Now is ");
3071 REGEX_ASSERT(fields[3]=="the time<c>");
3072 status = U_ZERO_ERROR;
3073 delete pat1;
3074
3075 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3076 pat1 = RegexPattern::compile(&re1, pe, status);
3077 REGEX_CHECK_STATUS;
3078 n = pat1->split("1-10,20", fields, 10, status);
3079 REGEX_CHECK_STATUS;
3080 REGEX_ASSERT(n==5);
3081 REGEX_ASSERT(fields[0]=="1");
3082 REGEX_ASSERT(fields[1]=="-");
3083 REGEX_ASSERT(fields[2]=="10");
3084 REGEX_ASSERT(fields[3]==",");
3085 REGEX_ASSERT(fields[4]=="20");
3086 delete pat1;
3087
3088
3089 //
3090 // split of a UText based string, with library allocating output UTexts.
3091 //
3092 {
3093 status = U_ZERO_ERROR;
3094 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3095 UnicodeString stringToSplit("first:second:third");
3096 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3097 REGEX_CHECK_STATUS;
3098
3099 UText *splits[10] = {NULL};
3100 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3101 REGEX_CHECK_STATUS;
3102 REGEX_ASSERT(numFields == 5);
3103 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3104 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3105 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3106 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3107 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3108 REGEX_ASSERT(splits[5] == NULL);
3109
3110 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3111 if (splits[i]) {
3112 utext_close(splits[i]);
3113 splits[i] = NULL;
3114 }
3115 }
3116 utext_close(textToSplit);
3117 }
3118
3119
3120 //
3121 // RegexPattern::pattern() and patternText()
3122 //
3123 pat1 = new RegexPattern();
3124 REGEX_ASSERT(pat1->pattern() == "");
3125 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3126 delete pat1;
3127 const char *helloWorldInvariant = "(Hello, world)*";
3128 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3129 pat1 = RegexPattern::compile(&re1, pe, status);
3130 REGEX_CHECK_STATUS;
3131 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3132 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3133 delete pat1;
3134
3135 utext_close(&re1);
3136}
3137
3138
3139//---------------------------------------------------------------------------
3140//
3141// Extended A more thorough check for features of regex patterns
3142// The test cases are in a separate data file,
3143// source/test/testdata/regextst.txt
3144// A description of the test data format is included in that file.
3145//
3146//---------------------------------------------------------------------------
3147
3148const char *
3149RegexTest::getPath(char buffer[2048], const char *filename) {
3150 UErrorCode status=U_ZERO_ERROR;
3151 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3152 if (U_FAILURE(status)) {
3153 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3154 return NULL;
3155 }
3156
3157 strcpy(buffer, testDataDirectory);
3158 strcat(buffer, filename);
3159 return buffer;
3160}
3161
3162void RegexTest::Extended() {
3163 char tdd[2048];
3164 const char *srcPath;
3165 UErrorCode status = U_ZERO_ERROR;
3166 int32_t lineNum = 0;
3167
3168 //
3169 // Open and read the test data file.
3170 //
3171 srcPath=getPath(tdd, "regextst.txt");
3172 if(srcPath==NULL) {
3173 return; /* something went wrong, error already output */
3174 }
3175
3176 int32_t len;
3177 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3178 if (U_FAILURE(status)) {
3179 return; /* something went wrong, error already output */
3180 }
3181
3182 //
3183 // Put the test data into a UnicodeString
3184 //
3185 UnicodeString testString(FALSE, testData, len);
3186
3187 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3188 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3189 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3190
3191 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3192 UnicodeString testPattern; // The pattern for test from the test file.
3193 UnicodeString testFlags; // the flags for a test.
3194 UnicodeString matchString; // The marked up string to be used as input
3195
3196 if (U_FAILURE(status)){
3197 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3198 delete [] testData;
3199 return;
3200 }
3201
3202 //
3203 // Loop over the test data file, once per line.
3204 //
3205 while (lineMat.find()) {
3206 lineNum++;
3207 if (U_FAILURE(status)) {
3208 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3209 }
3210
3211 status = U_ZERO_ERROR;
3212 UnicodeString testLine = lineMat.group(1, status);
3213 if (testLine.length() == 0) {
3214 continue;
3215 }
3216
3217 //
3218 // Parse the test line. Skip blank and comment only lines.
3219 // Separate out the three main fields - pattern, flags, target.
3220 //
3221
3222 commentMat.reset(testLine);
3223 if (commentMat.lookingAt(status)) {
3224 // This line is a comment, or blank.
3225 continue;
3226 }
3227
3228 //
3229 // Pull out the pattern field, remove it from the test file line.
3230 //
3231 quotedStuffMat.reset(testLine);
3232 if (quotedStuffMat.lookingAt(status)) {
3233 testPattern = quotedStuffMat.group(2, status);
3234 testLine.remove(0, quotedStuffMat.end(0, status));
3235 } else {
3236 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3237 continue;
3238 }
3239
3240
3241 //
3242 // Pull out the flags from the test file line.
3243 //
3244 flagsMat.reset(testLine);
3245 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3246 testFlags = flagsMat.group(1, status);
3247 if (flagsMat.group(2, status).length() > 0) {
3248 errln("Bad Match flag at line %d. Scanning %c\n",
3249 lineNum, flagsMat.group(2, status).charAt(0));
3250 continue;
3251 }
3252 testLine.remove(0, flagsMat.end(0, status));
3253
3254 //
3255 // Pull out the match string, as a whole.
3256 // We'll process the <tags> later.
3257 //
3258 quotedStuffMat.reset(testLine);
3259 if (quotedStuffMat.lookingAt(status)) {
3260 matchString = quotedStuffMat.group(2, status);
3261 testLine.remove(0, quotedStuffMat.end(0, status));
3262 } else {
3263 errln("Bad match string at test file line %d", lineNum);
3264 continue;
3265 }
3266
3267 //
3268 // The only thing left from the input line should be an optional trailing comment.
3269 //
3270 commentMat.reset(testLine);
3271 if (commentMat.lookingAt(status) == FALSE) {
3272 errln("Line %d: unexpected characters at end of test line.", lineNum);
3273 continue;
3274 }
3275
3276 //
3277 // Run the test
3278 //
3279 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3280 }
3281
3282 delete [] testData;
3283
3284}
3285
3286
3287
3288//---------------------------------------------------------------------------
3289//
3290// regex_find(pattern, flags, inputString, lineNumber)
3291//
3292// Function to run a single test from the Extended (data driven) tests.
3293// See file test/testdata/regextst.txt for a description of the
3294// pattern and inputString fields, and the allowed flags.
3295// lineNumber is the source line in regextst.txt of the test.
3296//
3297//---------------------------------------------------------------------------
3298
3299
3300// Set a value into a UVector at position specified by a decimal number in
3301// a UnicodeString. This is a utility function needed by the actual test function,
3302// which follows.
3303static void set(UVector &vec, int32_t val, UnicodeString index) {
3304 UErrorCode status=U_ZERO_ERROR;
3305 int32_t idx = 0;
3306 for (int32_t i=0; i<index.length(); i++) {
3307 int32_t d=u_charDigitValue(index.charAt(i));
3308 if (d<0) {return;}
3309 idx = idx*10 + d;
3310 }
3311 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3312 vec.setElementAt(val, idx);
3313}
3314
3315static void setInt(UVector &vec, int32_t val, int32_t idx) {
3316 UErrorCode status=U_ZERO_ERROR;
3317 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3318 vec.setElementAt(val, idx);
3319}
3320
3321static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3322{
3323 UBool couldFind = TRUE;
3324 UTEXT_SETNATIVEINDEX(utext, 0);
3325 int32_t i = 0;
3326 while (i < unistrOffset) {
3327 UChar32 c = UTEXT_NEXT32(utext);
3328 if (c != U_SENTINEL) {
3329 i += U16_LENGTH(c);
3330 } else {
3331 couldFind = FALSE;
3332 break;
3333 }
3334 }
3335 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3336 return couldFind;
3337}
3338
3339
3340void RegexTest::regex_find(const UnicodeString &pattern,
3341 const UnicodeString &flags,
3342 const UnicodeString &inputString,
3343 const char *srcPath,
3344 int32_t line) {
3345 UnicodeString unEscapedInput;
3346 UnicodeString deTaggedInput;
3347
3348 int32_t patternUTF8Length, inputUTF8Length;
3349 char *patternChars = NULL, *inputChars = NULL;
3350 UText patternText = UTEXT_INITIALIZER;
3351 UText inputText = UTEXT_INITIALIZER;
3352 UConverter *UTF8Converter = NULL;
3353
3354 UErrorCode status = U_ZERO_ERROR;
3355 UParseError pe;
3356 RegexPattern *parsePat = NULL;
3357 RegexMatcher *parseMatcher = NULL;
3358 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3359 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3360 UVector groupStarts(status);
3361 UVector groupEnds(status);
3362 UVector groupStartsUTF8(status);
3363 UVector groupEndsUTF8(status);
3364 UBool isMatch = FALSE, isUTF8Match = FALSE;
3365 UBool failed = FALSE;
3366 int32_t numFinds;
3367 int32_t i;
3368 UBool useMatchesFunc = FALSE;
3369 UBool useLookingAtFunc = FALSE;
3370 int32_t regionStart = -1;
3371 int32_t regionEnd = -1;
3372 int32_t regionStartUTF8 = -1;
3373 int32_t regionEndUTF8 = -1;
3374
3375
3376 //
3377 // Compile the caller's pattern
3378 //
3379 uint32_t bflags = 0;
3380 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3381 bflags |= UREGEX_CASE_INSENSITIVE;
3382 }
3383 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3384 bflags |= UREGEX_COMMENTS;
3385 }
3386 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3387 bflags |= UREGEX_DOTALL;
3388 }
3389 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3390 bflags |= UREGEX_MULTILINE;
3391 }
3392
3393 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3394 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3395 }
3396 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3397 bflags |= UREGEX_UNIX_LINES;
3398 }
3399 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3400 bflags |= UREGEX_LITERAL;
3401 }
3402
3403
3404 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3405 if (status != U_ZERO_ERROR) {
3406 #if UCONFIG_NO_BREAK_ITERATION==1
3407 // 'v' test flag means that the test pattern should not compile if ICU was configured
3408 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3409 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3410 goto cleanupAndReturn;
3411 }
3412 #endif
3413 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3414 // Expected pattern compilation error.
3415 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3416 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3417 }
3418 goto cleanupAndReturn;
3419 } else {
3420 // Unexpected pattern compilation error.
3421 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3422 goto cleanupAndReturn;
3423 }
3424 }
3425
3426 UTF8Converter = ucnv_open("UTF8", &status);
3427 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3428
3429 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3430 status = U_ZERO_ERROR; // buffer overflow
3431 patternChars = new char[patternUTF8Length+1];
3432 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3433 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3434
3435 if (status == U_ZERO_ERROR) {
3436 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3437
3438 if (status != U_ZERO_ERROR) {
3439#if UCONFIG_NO_BREAK_ITERATION==1
3440 // 'v' test flag means that the test pattern should not compile if ICU was configured
3441 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3442 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3443 goto cleanupAndReturn;
3444 }
3445#endif
3446 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3447 // Expected pattern compilation error.
3448 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3449 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3450 }
3451 goto cleanupAndReturn;
3452 } else {
3453 // Unexpected pattern compilation error.
3454 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3455 goto cleanupAndReturn;
3456 }
3457 }
3458 }
3459
3460 if (UTF8Pattern == NULL) {
3461 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3462 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3463 status = U_ZERO_ERROR;
3464 }
3465
3466 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3467 callerPattern->dumpPattern();
3468 }
3469
3470 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3471 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3472 goto cleanupAndReturn;
3473 }
3474
3475
3476 //
3477 // Number of times find() should be called on the test string, default to 1
3478 //
3479 numFinds = 1;
3480 for (i=2; i<=9; i++) {
3481 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3482 if (numFinds != 1) {
3483 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3484 goto cleanupAndReturn;
3485 }
3486 numFinds = i;
3487 }
3488 }
3489
3490 // 'M' flag. Use matches() instead of find()
3491 if (flags.indexOf((UChar)0x4d) >= 0) {
3492 useMatchesFunc = TRUE;
3493 }
3494 if (flags.indexOf((UChar)0x4c) >= 0) {
3495 useLookingAtFunc = TRUE;
3496 }
3497
3498 //
3499 // Find the tags in the input data, remove them, and record the group boundary
3500 // positions.
3501 //
3502 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3503 REGEX_CHECK_STATUS_L(line);
3504
3505 unEscapedInput = inputString.unescape();
3506 parseMatcher = parsePat->matcher(unEscapedInput, status);
3507 REGEX_CHECK_STATUS_L(line);
3508 while(parseMatcher->find()) {
3509 parseMatcher->appendReplacement(deTaggedInput, "", status);
3510 REGEX_CHECK_STATUS;
3511 UnicodeString groupNum = parseMatcher->group(2, status);
3512 if (groupNum == "r") {
3513 // <r> or </r>, a region specification within the string
3514 if (parseMatcher->group(1, status) == "/") {
3515 regionEnd = deTaggedInput.length();
3516 } else {
3517 regionStart = deTaggedInput.length();
3518 }
3519 } else {
3520 // <digits> or </digits>, a group match boundary tag.
3521 if (parseMatcher->group(1, status) == "/") {
3522 set(groupEnds, deTaggedInput.length(), groupNum);
3523 } else {
3524 set(groupStarts, deTaggedInput.length(), groupNum);
3525 }
3526 }
3527 }
3528 parseMatcher->appendTail(deTaggedInput);
3529
3530 if (groupStarts.size() != groupEnds.size()) {
3531 errln("Error at line %d: mismatched <n> group tags in expected results.", line);
3532 failed = true;
3533 goto cleanupAndReturn;
3534 }
3535 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3536 errln("mismatched <r> tags");
3537 failed = TRUE;
3538 goto cleanupAndReturn;
3539 }
3540
3541 //
3542 // Configure the matcher according to the flags specified with this test.
3543 //
3544 matcher = callerPattern->matcher(deTaggedInput, status);
3545 REGEX_CHECK_STATUS_L(line);
3546 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3547 matcher->setTrace(TRUE);
3548 }
3549
3550 if (UTF8Pattern != NULL) {
3551 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3552 status = U_ZERO_ERROR; // buffer overflow
3553 inputChars = new char[inputUTF8Length+1];
3554 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3555 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3556
3557 if (status == U_ZERO_ERROR) {
3558 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3559 REGEX_CHECK_STATUS_L(line);
3560 }
3561
3562 if (UTF8Matcher == NULL) {
3563 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3564 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3565 status = U_ZERO_ERROR;
3566 }
3567 }
3568
3569 //
3570 // Generate native indices for UTF8 versions of region and capture group info
3571 //
3572 if (UTF8Matcher != NULL) {
3573 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3574 UTF8Matcher->setTrace(TRUE);
3575 }
3576 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3577 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3578
3579 // Fill out the native index UVector info.
3580 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3581 for (i=0; i<groupStarts.size(); i++) {
3582 int32_t start = groupStarts.elementAti(i);
3583 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3584 if (start >= 0) {
3585 int32_t startUTF8;
3586 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3587 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3588 failed = TRUE;
3589 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3590 }
3591 setInt(groupStartsUTF8, startUTF8, i);
3592 }
3593
3594 int32_t end = groupEnds.elementAti(i);
3595 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3596 if (end >= 0) {
3597 int32_t endUTF8;
3598 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3599 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3600 failed = TRUE;
3601 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3602 }
3603 setInt(groupEndsUTF8, endUTF8, i);
3604 }
3605 }
3606 }
3607
3608 if (regionStart>=0) {
3609 matcher->region(regionStart, regionEnd, status);
3610 REGEX_CHECK_STATUS_L(line);
3611 if (UTF8Matcher != NULL) {
3612 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3613 REGEX_CHECK_STATUS_L(line);
3614 }
3615 }
3616 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3617 matcher->useAnchoringBounds(FALSE);
3618 if (UTF8Matcher != NULL) {
3619 UTF8Matcher->useAnchoringBounds(FALSE);
3620 }
3621 }
3622 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3623 matcher->useTransparentBounds(TRUE);
3624 if (UTF8Matcher != NULL) {
3625 UTF8Matcher->useTransparentBounds(TRUE);
3626 }
3627 }
3628
3629
3630
3631 //
3632 // Do a find on the de-tagged input using the caller's pattern
3633 // TODO: error on count>1 and not find().
3634 // error on both matches() and lookingAt().
3635 //
3636 for (i=0; i<numFinds; i++) {
3637 if (useMatchesFunc) {
3638 isMatch = matcher->matches(status);
3639 if (UTF8Matcher != NULL) {
3640 isUTF8Match = UTF8Matcher->matches(status);
3641 }
3642 } else if (useLookingAtFunc) {
3643 isMatch = matcher->lookingAt(status);
3644 if (UTF8Matcher != NULL) {
3645 isUTF8Match = UTF8Matcher->lookingAt(status);
3646 }
3647 } else {
3648 isMatch = matcher->find();
3649 if (UTF8Matcher != NULL) {
3650 isUTF8Match = UTF8Matcher->find();
3651 }
3652 }
3653 }
3654 matcher->setTrace(FALSE);
3655 if (UTF8Matcher) {
3656 UTF8Matcher->setTrace(FALSE);
3657 }
3658 if (U_FAILURE(status)) {
3659 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3660 }
3661
3662 //
3663 // Match up the groups from the find() with the groups from the tags
3664 //
3665
3666 // number of tags should match number of groups from find operation.
3667 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3668 // G option in test means that capture group data is not available in the
3669 // expected results, so the check needs to be suppressed.
3670 if (isMatch == FALSE && groupStarts.size() != 0) {
3671 dataerrln("Error at line %d: Match expected, but none found.", line);
3672 failed = TRUE;
3673 goto cleanupAndReturn;
3674 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3675 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3676 failed = TRUE;
3677 goto cleanupAndReturn;
3678 }
3679 if (isMatch && groupStarts.size() == 0) {
3680 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3681 failed = TRUE;
3682 }
3683 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3684 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3685 failed = TRUE;
3686 }
3687
3688 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3689 // Only check for match / no match. Don't check capture groups.
3690 goto cleanupAndReturn;
3691 }
3692
3693 REGEX_CHECK_STATUS_L(line);
3694 for (i=0; i<=matcher->groupCount(); i++) {
3695 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3696 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3697 if (matcher->start(i, status) != expectedStart) {
3698 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3699 line, i, expectedStart, matcher->start(i, status));
3700 failed = TRUE;
3701 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3702 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3703 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3704 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3705 failed = TRUE;
3706 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3707 }
3708
3709 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3710 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3711 if (matcher->end(i, status) != expectedEnd) {
3712 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3713 line, i, expectedEnd, matcher->end(i, status));
3714 failed = TRUE;
3715 // Error on end position; keep going; real error is probably yet to come as group
3716 // end positions work from end of the input data towards the front.
3717 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3718 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3719 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3720 failed = TRUE;
3721 // Error on end position; keep going; real error is probably yet to come as group
3722 // end positions work from end of the input data towards the front.
3723 }
3724 }
3725 if ( matcher->groupCount()+1 < groupStarts.size()) {
3726 errln("Error at line %d: Expected %d capture groups, found %d.",
3727 line, groupStarts.size()-1, matcher->groupCount());
3728 failed = TRUE;
3729 }
3730 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3731 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3732 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3733 failed = TRUE;
3734 }
3735
3736 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3737 matcher->requireEnd() == TRUE) {
3738 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3739 failed = TRUE;
3740 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3741 UTF8Matcher->requireEnd() == TRUE) {
3742 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3743 failed = TRUE;
3744 }
3745
3746 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3747 matcher->requireEnd() == FALSE) {
3748 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3749 failed = TRUE;
3750 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3751 UTF8Matcher->requireEnd() == FALSE) {
3752 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3753 failed = TRUE;
3754 }
3755
3756 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3757 matcher->hitEnd() == TRUE) {
3758 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3759 failed = TRUE;
3760 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3761 UTF8Matcher->hitEnd() == TRUE) {
3762 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3763 failed = TRUE;
3764 }
3765
3766 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3767 matcher->hitEnd() == FALSE) {
3768 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3769 failed = TRUE;
3770 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3771 UTF8Matcher->hitEnd() == FALSE) {
3772 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3773 failed = TRUE;
3774 }
3775
3776
3777cleanupAndReturn:
3778 if (failed) {
3779 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3780 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3781 // callerPattern->dump();
3782 }
3783 delete parseMatcher;
3784 delete parsePat;
3785 delete UTF8Matcher;
3786 delete UTF8Pattern;
3787 delete matcher;
3788 delete callerPattern;
3789
3790 utext_close(&inputText);
3791 delete[] inputChars;
3792 utext_close(&patternText);
3793 delete[] patternChars;
3794 ucnv_close(UTF8Converter);
3795}
3796
3797
3798
3799
3800//---------------------------------------------------------------------------
3801//
3802// Errors Check for error handling in patterns.
3803//
3804//---------------------------------------------------------------------------
3805void RegexTest::Errors() {
3806 // \escape sequences that aren't implemented yet.
3807 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3808
3809 // Missing close parentheses
3810 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3811 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3812 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3813
3814 // Extra close paren
3815 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3816 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3817 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3818
3819 // Look-ahead, Look-behind
3820 // TODO: add tests for unbounded length look-behinds.
3821 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3822
3823 // Attempt to use non-default flags
3824 {
3825 UParseError pe;
3826 UErrorCode status = U_ZERO_ERROR;
3827 int32_t flags = UREGEX_CANON_EQ |
3828 UREGEX_COMMENTS | UREGEX_DOTALL |
3829 UREGEX_MULTILINE;
3830 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3831 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3832 delete pat1;
3833 }
3834
3835
3836 // Quantifiers are allowed only after something that can be quantified.
3837 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3838 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3839 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3840
3841 // Mal-formed {min,max} quantifiers
3842 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3843 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3844 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3845 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3846 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3847 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3848 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3849 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3850 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3851
3852 // Ticket 5389
3853 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3854
3855 // Invalid Back Reference \0
3856 // For ICU 3.8 and earlier
3857 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3858 //
3859 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3860
3861}
3862
3863
3864//-------------------------------------------------------------------------------
3865//
3866// Read a text data file, convert it to UChars, and return the data
3867// in one big UChar * buffer, which the caller must delete.
3868//
3869//--------------------------------------------------------------------------------
3870UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3871 const char *defEncoding, UErrorCode &status) {
3872 UChar *retPtr = NULL;
3873 char *fileBuf = NULL;
3874 UConverter* conv = NULL;
3875 FILE *f = NULL;
3876
3877 ulen = 0;
3878 if (U_FAILURE(status)) {
3879 return retPtr;
3880 }
3881
3882 //
3883 // Open the file.
3884 //
3885 f = fopen(fileName, "rb");
3886 if (f == 0) {
3887 dataerrln("Error opening test data file %s\n", fileName);
3888 status = U_FILE_ACCESS_ERROR;
3889 return NULL;
3890 }
3891 //
3892 // Read it in
3893 //
3894 int32_t fileSize;
3895 int32_t amt_read;
3896
3897 fseek( f, 0, SEEK_END);
3898 fileSize = ftell(f);
3899 fileBuf = new char[fileSize];
3900 fseek(f, 0, SEEK_SET);
3901 amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
3902 if (amt_read != fileSize || fileSize <= 0) {
3903 errln("Error reading test data file.");
3904 goto cleanUpAndReturn;
3905 }
3906
3907 //
3908 // Look for a Unicode Signature (BOM) on the data just read
3909 //
3910 int32_t signatureLength;
3911 const char * fileBufC;
3912 const char* encoding;
3913
3914 fileBufC = fileBuf;
3915 encoding = ucnv_detectUnicodeSignature(
3916 fileBuf, fileSize, &signatureLength, &status);
3917 if(encoding!=NULL ){
3918 fileBufC += signatureLength;
3919 fileSize -= signatureLength;
3920 } else {
3921 encoding = defEncoding;
3922 if (strcmp(encoding, "utf-8") == 0) {
3923 errln("file %s is missing its BOM", fileName);
3924 }
3925 }
3926
3927 //
3928 // Open a converter to take the rule file to UTF-16
3929 //
3930 conv = ucnv_open(encoding, &status);
3931 if (U_FAILURE(status)) {
3932 goto cleanUpAndReturn;
3933 }
3934
3935 //
3936 // Convert the rules to UChar.
3937 // Preflight first to determine required buffer size.
3938 //
3939 ulen = ucnv_toUChars(conv,
3940 NULL, // dest,
3941 0, // destCapacity,
3942 fileBufC,
3943 fileSize,
3944 &status);
3945 if (status == U_BUFFER_OVERFLOW_ERROR) {
3946 // Buffer Overflow is expected from the preflight operation.
3947 status = U_ZERO_ERROR;
3948
3949 retPtr = new UChar[ulen+1];
3950 ucnv_toUChars(conv,
3951 retPtr, // dest,
3952 ulen+1,
3953 fileBufC,
3954 fileSize,
3955 &status);
3956 }
3957
3958cleanUpAndReturn:
3959 fclose(f);
3960 delete[] fileBuf;
3961 ucnv_close(conv);
3962 if (U_FAILURE(status)) {
3963 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3964 delete []retPtr;
3965 retPtr = 0;
3966 ulen = 0;
3967 }
3968 return retPtr;
3969}
3970
3971
3972//-------------------------------------------------------------------------------
3973//
3974// PerlTests - Run Perl's regular expression tests
3975// The input file for this test is re_tests, the standard regular
3976// expression test data distributed with the Perl source code.
3977//
3978// Here is Perl's description of the test data file:
3979//
3980// # The tests are in a separate file 't/op/re_tests'.
3981// # Each line in that file is a separate test.
3982// # There are five columns, separated by tabs.
3983// #
3984// # Column 1 contains the pattern, optionally enclosed in C<''>.
3985// # Modifiers can be put after the closing C<'>.
3986// #
3987// # Column 2 contains the string to be matched.
3988// #
3989// # Column 3 contains the expected result:
3990// # y expect a match
3991// # n expect no match
3992// # c expect an error
3993// # B test exposes a known bug in Perl, should be skipped
3994// # b test exposes a known bug in Perl, should be skipped if noamp
3995// #
3996// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3997// #
3998// # Column 4 contains a string, usually C<$&>.
3999// #
4000// # Column 5 contains the expected result of double-quote
4001// # interpolating that string after the match, or start of error message.
4002// #
4003// # Column 6, if present, contains a reason why the test is skipped.
4004// # This is printed with "skipped", for harness to pick up.
4005// #
4006// # \n in the tests are interpolated, as are variables of the form ${\w+}.
4007// #
4008// # If you want to add a regular expression test that can't be expressed
4009// # in this format, don't add it here: put it in op/pat.t instead.
4010//
4011// For ICU, if field 3 contains an 'i', the test will be skipped.
4012 // The test exposes some known incompatibility between ICU and Perl regexps.
4013// (The i is in addition to whatever was there before.)
4014//
4015//-------------------------------------------------------------------------------
4016void RegexTest::PerlTests() {
4017 char tdd[2048];
4018 const char *srcPath;
4019 UErrorCode status = U_ZERO_ERROR;
4020 UParseError pe;
4021
4022 //
4023 // Open and read the test data file.
4024 //
4025 srcPath=getPath(tdd, "re_tests.txt");
4026 if(srcPath==NULL) {
4027 return; /* something went wrong, error already output */
4028 }
4029
4030 int32_t len;
4031 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4032 if (U_FAILURE(status)) {
4033 return; /* something went wrong, error already output */
4034 }
4035
4036 //
4037 // Put the test data into a UnicodeString
4038 //
4039 UnicodeString testDataString(FALSE, testData, len);
4040
4041 //
4042 // Regex to break the input file into lines, and strip the new lines.
4043 // One line per match, capture group one is the desired data.
4044 //
4045 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4046 if (U_FAILURE(status)) {
4047 dataerrln("RegexPattern::compile() error");
4048 return;
4049 }
4050 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4051
4052 //
4053 // Regex to split a test file line into fields.
4054 // There are six fields, separated by tabs.
4055 //
4056 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4057
4058 //
4059 // Regex to identify test patterns with flag settings, and to separate them.
4060 // Test patterns with flags look like 'pattern'i
4061 // Test patterns without flags are not quoted: pattern
4062 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4063 //
4064 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4065 RegexMatcher* flagMat = flagPat->matcher(status);
4066
4067 //
4068 // The Perl tests reference several perl-isms, which are evaluated/substituted
4069 // in the test data. Not being perl, this must be done explicitly. Here
4070 // are string constants and REs for these constructs.
4071 //
4072 UnicodeString nulnulSrc("${nulnul}");
4073 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4074 nulnul = nulnul.unescape();
4075
4076 UnicodeString ffffSrc("${ffff}");
4077 UnicodeString ffff("\\uffff", -1, US_INV);
4078 ffff = ffff.unescape();
4079
4080 // regexp for $-[0], $+[2], etc.
4081 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4082 RegexMatcher *groupsMat = groupsPat->matcher(status);
4083
4084 // regexp for $0, $1, $2, etc.
4085 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4086 RegexMatcher *cgMat = cgPat->matcher(status);
4087
4088
4089 //
4090 // Main Loop for the Perl Tests, runs once per line from the
4091 // test data file.
4092 //
4093 int32_t lineNum = 0;
4094 int32_t skippedUnimplementedCount = 0;
4095 while (lineMat->find()) {
4096 lineNum++;
4097
4098 //
4099 // Get a line, break it into its fields, do the Perl
4100 // variable substitutions.
4101 //
4102 UnicodeString line = lineMat->group(1, status);
4103 UnicodeString fields[7];
4104 fieldPat->split(line, fields, 7, status);
4105
4106 flagMat->reset(fields[0]);
4107 flagMat->matches(status);
4108 UnicodeString pattern = flagMat->group(2, status);
4109 pattern.findAndReplace("${bang}", "!");
4110 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4111 pattern.findAndReplace(ffffSrc, ffff);
4112
4113 //
4114 // Identify patterns that include match flag settings,
4115 // split off the flags, remove the extra quotes.
4116 //
4117 UnicodeString flagStr = flagMat->group(3, status);
4118 if (U_FAILURE(status)) {
4119 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4120 return;
4121 }
4122 int32_t flags = 0;
4123 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4124 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4125 const UChar UChar_m = 0x6d;
4126 const UChar UChar_x = 0x78;
4127 const UChar UChar_y = 0x79;
4128 if (flagStr.indexOf(UChar_i) != -1) {
4129 flags |= UREGEX_CASE_INSENSITIVE;
4130 }
4131 if (flagStr.indexOf(UChar_m) != -1) {
4132 flags |= UREGEX_MULTILINE;
4133 }
4134 if (flagStr.indexOf(UChar_x) != -1) {
4135 flags |= UREGEX_COMMENTS;
4136 }
4137
4138 //
4139 // Compile the test pattern.
4140 //
4141 status = U_ZERO_ERROR;
4142 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4143 if (status == U_REGEX_UNIMPLEMENTED) {
4144 //
4145 // Test of a feature that is planned for ICU, but not yet implemented.
4146 // skip the test.
4147 skippedUnimplementedCount++;
4148 delete testPat;
4149 status = U_ZERO_ERROR;
4150 continue;
4151 }
4152
4153 if (U_FAILURE(status)) {
4154 // Some tests are supposed to generate errors.
4155 // Only report an error for tests that are supposed to succeed.
4156 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4157 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4158 {
4159 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4160 }
4161 status = U_ZERO_ERROR;
4162 delete testPat;
4163 continue;
4164 }
4165
4166 if (fields[2].indexOf(UChar_i) >= 0) {
4167 // ICU should skip this test.
4168 delete testPat;
4169 continue;
4170 }
4171
4172 if (fields[2].indexOf(UChar_c) >= 0) {
4173 // This pattern should have caused a compilation error, but didn't/
4174 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4175 delete testPat;
4176 continue;
4177 }
4178
4179 //
4180 // replace the Perl variables that appear in some of the
4181 // match data strings.
4182 //
4183 UnicodeString matchString = fields[1];
4184 matchString.findAndReplace(nulnulSrc, nulnul);
4185 matchString.findAndReplace(ffffSrc, ffff);
4186
4187 // Replace any \n in the match string with an actual new-line char.
4188 // Don't do full unescape, as this unescapes more than Perl does, which
4189 // causes other spurious failures in the tests.
4190 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4191
4192
4193
4194 //
4195 // Run the test, check for expected match/don't match result.
4196 //
4197 RegexMatcher *testMat = testPat->matcher(matchString, status);
4198 UBool found = testMat->find();
4199 UBool expected = FALSE;
4200 if (fields[2].indexOf(UChar_y) >=0) {
4201 expected = TRUE;
4202 }
4203 if (expected != found) {
4204 errln("line %d: Expected %smatch, got %smatch",
4205 lineNum, expected?"":"no ", found?"":"no " );
4206 continue;
4207 }
4208
4209 // Don't try to check expected results if there is no match.
4210 // (Some have stuff in the expected fields)
4211 if (!found) {
4212 delete testMat;
4213 delete testPat;
4214 continue;
4215 }
4216
4217 //
4218 // Interpret the Perl expression from the fourth field of the data file,
4219 // building up an ICU string from the results of the ICU match.
4220 // The Perl expression will contain references to the results of
4221 // a regex match, including the matched string, capture group strings,
4222 // group starting and ending indicies, etc.
4223 //
4224 UnicodeString resultString;
4225 UnicodeString perlExpr = fields[3];
4226#if SUPPORT_MUTATING_INPUT_STRING
4227 groupsMat->reset(perlExpr);
4228 cgMat->reset(perlExpr);
4229#endif
4230
4231 while (perlExpr.length() > 0) {
4232#if !SUPPORT_MUTATING_INPUT_STRING
4233 // Perferred usage. Reset after any modification to input string.
4234 groupsMat->reset(perlExpr);
4235 cgMat->reset(perlExpr);
4236#endif
4237
4238 if (perlExpr.startsWith("$&")) {
4239 resultString.append(testMat->group(status));
4240 perlExpr.remove(0, 2);
4241 }
4242
4243 else if (groupsMat->lookingAt(status)) {
4244 // $-[0] $+[2] etc.
4245 UnicodeString digitString = groupsMat->group(2, status);
4246 int32_t t = 0;
4247 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4248 UnicodeString plusOrMinus = groupsMat->group(1, status);
4249 int32_t matchPosition;
4250 if (plusOrMinus.compare("+") == 0) {
4251 matchPosition = testMat->end(groupNum, status);
4252 } else {
4253 matchPosition = testMat->start(groupNum, status);
4254 }
4255 if (matchPosition != -1) {
4256 ICU_Utility::appendNumber(resultString, matchPosition);
4257 }
4258 perlExpr.remove(0, groupsMat->end(status));
4259 }
4260
4261 else if (cgMat->lookingAt(status)) {
4262 // $1, $2, $3, etc.
4263 UnicodeString digitString = cgMat->group(1, status);
4264 int32_t t = 0;
4265 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4266 if (U_SUCCESS(status)) {
4267 resultString.append(testMat->group(groupNum, status));
4268 status = U_ZERO_ERROR;
4269 }
4270 perlExpr.remove(0, cgMat->end(status));
4271 }
4272
4273 else if (perlExpr.startsWith("@-")) {
4274 int32_t i;
4275 for (i=0; i<=testMat->groupCount(); i++) {
4276 if (i>0) {
4277 resultString.append(" ");
4278 }
4279 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4280 }
4281 perlExpr.remove(0, 2);
4282 }
4283
4284 else if (perlExpr.startsWith("@+")) {
4285 int32_t i;
4286 for (i=0; i<=testMat->groupCount(); i++) {
4287 if (i>0) {
4288 resultString.append(" ");
4289 }
4290 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4291 }
4292 perlExpr.remove(0, 2);
4293 }
4294
4295 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4296 // or as an escaped sequence (e.g. \n)
4297 if (perlExpr.length() > 1) {
4298 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4299 }
4300 UChar c = perlExpr.charAt(0);
4301 switch (c) {
4302 case 'n': c = '\n'; break;
4303 // add any other escape sequences that show up in the test expected results.
4304 }
4305 resultString.append(c);
4306 perlExpr.remove(0, 1);
4307 }
4308
4309 else {
4310 // Any characters from the perl expression that we don't explicitly
4311 // recognize before here are assumed to be literals and copied
4312 // as-is to the expected results.
4313 resultString.append(perlExpr.charAt(0));
4314 perlExpr.remove(0, 1);
4315 }
4316
4317 if (U_FAILURE(status)) {
4318 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4319 break;
4320 }
4321 }
4322
4323 //
4324 // Expected Results Compare
4325 //
4326 UnicodeString expectedS(fields[4]);
4327 expectedS.findAndReplace(nulnulSrc, nulnul);
4328 expectedS.findAndReplace(ffffSrc, ffff);
4329 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4330
4331
4332 if (expectedS.compare(resultString) != 0) {
4333 err("Line %d: Incorrect perl expression results.", lineNum);
4334 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4335 }
4336
4337 delete testMat;
4338 delete testPat;
4339 }
4340
4341 //
4342 // All done. Clean up allocated stuff.
4343 //
4344 delete cgMat;
4345 delete cgPat;
4346
4347 delete groupsMat;
4348 delete groupsPat;
4349
4350 delete flagMat;
4351 delete flagPat;
4352
4353 delete lineMat;
4354 delete linePat;
4355
4356 delete fieldPat;
4357 delete [] testData;
4358
4359
4360 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4361
4362}
4363
4364
4365//-------------------------------------------------------------------------------
4366//
4367// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4368// (instead of using UnicodeStrings) to test the alternate engine.
4369// The input file for this test is re_tests, the standard regular
4370// expression test data distributed with the Perl source code.
4371// See PerlTests() for more information.
4372//
4373//-------------------------------------------------------------------------------
4374void RegexTest::PerlTestsUTF8() {
4375 char tdd[2048];
4376 const char *srcPath;
4377 UErrorCode status = U_ZERO_ERROR;
4378 UParseError pe;
4379 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4380 UText patternText = UTEXT_INITIALIZER;
4381 char *patternChars = NULL;
4382 int32_t patternLength;
4383 int32_t patternCapacity = 0;
4384 UText inputText = UTEXT_INITIALIZER;
4385 char *inputChars = NULL;
4386 int32_t inputLength;
4387 int32_t inputCapacity = 0;
4388
4389 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4390
4391 //
4392 // Open and read the test data file.
4393 //
4394 srcPath=getPath(tdd, "re_tests.txt");
4395 if(srcPath==NULL) {
4396 return; /* something went wrong, error already output */
4397 }
4398
4399 int32_t len;
4400 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4401 if (U_FAILURE(status)) {
4402 return; /* something went wrong, error already output */
4403 }
4404
4405 //
4406 // Put the test data into a UnicodeString
4407 //
4408 UnicodeString testDataString(FALSE, testData, len);
4409
4410 //
4411 // Regex to break the input file into lines, and strip the new lines.
4412 // One line per match, capture group one is the desired data.
4413 //
4414 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4415 if (U_FAILURE(status)) {
4416 dataerrln("RegexPattern::compile() error");
4417 return;
4418 }
4419 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4420
4421 //
4422 // Regex to split a test file line into fields.
4423 // There are six fields, separated by tabs.
4424 //
4425 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4426
4427 //
4428 // Regex to identify test patterns with flag settings, and to separate them.
4429 // Test patterns with flags look like 'pattern'i
4430 // Test patterns without flags are not quoted: pattern
4431 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4432 //
4433 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4434 RegexMatcher* flagMat = flagPat->matcher(status);
4435
4436 //
4437 // The Perl tests reference several perl-isms, which are evaluated/substituted
4438 // in the test data. Not being perl, this must be done explicitly. Here
4439 // are string constants and REs for these constructs.
4440 //
4441 UnicodeString nulnulSrc("${nulnul}");
4442 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4443 nulnul = nulnul.unescape();
4444
4445 UnicodeString ffffSrc("${ffff}");
4446 UnicodeString ffff("\\uffff", -1, US_INV);
4447 ffff = ffff.unescape();
4448
4449 // regexp for $-[0], $+[2], etc.
4450 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4451 RegexMatcher *groupsMat = groupsPat->matcher(status);
4452
4453 // regexp for $0, $1, $2, etc.
4454 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4455 RegexMatcher *cgMat = cgPat->matcher(status);
4456
4457
4458 //
4459 // Main Loop for the Perl Tests, runs once per line from the
4460 // test data file.
4461 //
4462 int32_t lineNum = 0;
4463 int32_t skippedUnimplementedCount = 0;
4464 while (lineMat->find()) {
4465 lineNum++;
4466
4467 //
4468 // Get a line, break it into its fields, do the Perl
4469 // variable substitutions.
4470 //
4471 UnicodeString line = lineMat->group(1, status);
4472 UnicodeString fields[7];
4473 fieldPat->split(line, fields, 7, status);
4474
4475 flagMat->reset(fields[0]);
4476 flagMat->matches(status);
4477 UnicodeString pattern = flagMat->group(2, status);
4478 pattern.findAndReplace("${bang}", "!");
4479 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4480 pattern.findAndReplace(ffffSrc, ffff);
4481
4482 //
4483 // Identify patterns that include match flag settings,
4484 // split off the flags, remove the extra quotes.
4485 //
4486 UnicodeString flagStr = flagMat->group(3, status);
4487 if (U_FAILURE(status)) {
4488 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4489 return;
4490 }
4491 int32_t flags = 0;
4492 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4493 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4494 const UChar UChar_m = 0x6d;
4495 const UChar UChar_x = 0x78;
4496 const UChar UChar_y = 0x79;
4497 if (flagStr.indexOf(UChar_i) != -1) {
4498 flags |= UREGEX_CASE_INSENSITIVE;
4499 }
4500 if (flagStr.indexOf(UChar_m) != -1) {
4501 flags |= UREGEX_MULTILINE;
4502 }
4503 if (flagStr.indexOf(UChar_x) != -1) {
4504 flags |= UREGEX_COMMENTS;
4505 }
4506
4507 //
4508 // Put the pattern in a UTF-8 UText
4509 //
4510 status = U_ZERO_ERROR;
4511 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4512 if (status == U_BUFFER_OVERFLOW_ERROR) {
4513 status = U_ZERO_ERROR;
4514 delete[] patternChars;
4515 patternCapacity = patternLength + 1;
4516 patternChars = new char[patternCapacity];
4517 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4518 }
4519 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4520
4521 //
4522 // Compile the test pattern.
4523 //
4524 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4525 if (status == U_REGEX_UNIMPLEMENTED) {
4526 //
4527 // Test of a feature that is planned for ICU, but not yet implemented.
4528 // skip the test.
4529 skippedUnimplementedCount++;
4530 delete testPat;
4531 status = U_ZERO_ERROR;
4532 continue;
4533 }
4534
4535 if (U_FAILURE(status)) {
4536 // Some tests are supposed to generate errors.
4537 // Only report an error for tests that are supposed to succeed.
4538 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4539 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4540 {
4541 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4542 }
4543 status = U_ZERO_ERROR;
4544 delete testPat;
4545 continue;
4546 }
4547
4548 if (fields[2].indexOf(UChar_i) >= 0) {
4549 // ICU should skip this test.
4550 delete testPat;
4551 continue;
4552 }
4553
4554 if (fields[2].indexOf(UChar_c) >= 0) {
4555 // This pattern should have caused a compilation error, but didn't/
4556 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4557 delete testPat;
4558 continue;
4559 }
4560
4561
4562 //
4563 // replace the Perl variables that appear in some of the
4564 // match data strings.
4565 //
4566 UnicodeString matchString = fields[1];
4567 matchString.findAndReplace(nulnulSrc, nulnul);
4568 matchString.findAndReplace(ffffSrc, ffff);
4569
4570 // Replace any \n in the match string with an actual new-line char.
4571 // Don't do full unescape, as this unescapes more than Perl does, which
4572 // causes other spurious failures in the tests.
4573 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4574
4575 //
4576 // Put the input in a UTF-8 UText
4577 //
4578 status = U_ZERO_ERROR;
4579 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4580 if (status == U_BUFFER_OVERFLOW_ERROR) {
4581 status = U_ZERO_ERROR;
4582 delete[] inputChars;
4583 inputCapacity = inputLength + 1;
4584 inputChars = new char[inputCapacity];
4585 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4586 }
4587 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4588
4589 //
4590 // Run the test, check for expected match/don't match result.
4591 //
4592 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4593 UBool found = testMat->find();
4594 UBool expected = FALSE;
4595 if (fields[2].indexOf(UChar_y) >=0) {
4596 expected = TRUE;
4597 }
4598 if (expected != found) {
4599 errln("line %d: Expected %smatch, got %smatch",
4600 lineNum, expected?"":"no ", found?"":"no " );
4601 continue;
4602 }
4603
4604 // Don't try to check expected results if there is no match.
4605 // (Some have stuff in the expected fields)
4606 if (!found) {
4607 delete testMat;
4608 delete testPat;
4609 continue;
4610 }
4611
4612 //
4613 // Interpret the Perl expression from the fourth field of the data file,
4614 // building up an ICU string from the results of the ICU match.
4615 // The Perl expression will contain references to the results of
4616 // a regex match, including the matched string, capture group strings,
4617 // group starting and ending indicies, etc.
4618 //
4619 UnicodeString resultString;
4620 UnicodeString perlExpr = fields[3];
4621
4622 while (perlExpr.length() > 0) {
4623 groupsMat->reset(perlExpr);
4624 cgMat->reset(perlExpr);
4625
4626 if (perlExpr.startsWith("$&")) {
4627 resultString.append(testMat->group(status));
4628 perlExpr.remove(0, 2);
4629 }
4630
4631 else if (groupsMat->lookingAt(status)) {
4632 // $-[0] $+[2] etc.
4633 UnicodeString digitString = groupsMat->group(2, status);
4634 int32_t t = 0;
4635 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4636 UnicodeString plusOrMinus = groupsMat->group(1, status);
4637 int32_t matchPosition;
4638 if (plusOrMinus.compare("+") == 0) {
4639 matchPosition = testMat->end(groupNum, status);
4640 } else {
4641 matchPosition = testMat->start(groupNum, status);
4642 }
4643 if (matchPosition != -1) {
4644 ICU_Utility::appendNumber(resultString, matchPosition);
4645 }
4646 perlExpr.remove(0, groupsMat->end(status));
4647 }
4648
4649 else if (cgMat->lookingAt(status)) {
4650 // $1, $2, $3, etc.
4651 UnicodeString digitString = cgMat->group(1, status);
4652 int32_t t = 0;
4653 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4654 if (U_SUCCESS(status)) {
4655 resultString.append(testMat->group(groupNum, status));
4656 status = U_ZERO_ERROR;
4657 }
4658 perlExpr.remove(0, cgMat->end(status));
4659 }
4660
4661 else if (perlExpr.startsWith("@-")) {
4662 int32_t i;
4663 for (i=0; i<=testMat->groupCount(); i++) {
4664 if (i>0) {
4665 resultString.append(" ");
4666 }
4667 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4668 }
4669 perlExpr.remove(0, 2);
4670 }
4671
4672 else if (perlExpr.startsWith("@+")) {
4673 int32_t i;
4674 for (i=0; i<=testMat->groupCount(); i++) {
4675 if (i>0) {
4676 resultString.append(" ");
4677 }
4678 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4679 }
4680 perlExpr.remove(0, 2);
4681 }
4682
4683 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4684 // or as an escaped sequence (e.g. \n)
4685 if (perlExpr.length() > 1) {
4686 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4687 }
4688 UChar c = perlExpr.charAt(0);
4689 switch (c) {
4690 case 'n': c = '\n'; break;
4691 // add any other escape sequences that show up in the test expected results.
4692 }
4693 resultString.append(c);
4694 perlExpr.remove(0, 1);
4695 }
4696
4697 else {
4698 // Any characters from the perl expression that we don't explicitly
4699 // recognize before here are assumed to be literals and copied
4700 // as-is to the expected results.
4701 resultString.append(perlExpr.charAt(0));
4702 perlExpr.remove(0, 1);
4703 }
4704
4705 if (U_FAILURE(status)) {
4706 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4707 break;
4708 }
4709 }
4710
4711 //
4712 // Expected Results Compare
4713 //
4714 UnicodeString expectedS(fields[4]);
4715 expectedS.findAndReplace(nulnulSrc, nulnul);
4716 expectedS.findAndReplace(ffffSrc, ffff);
4717 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4718
4719
4720 if (expectedS.compare(resultString) != 0) {
4721 err("Line %d: Incorrect perl expression results.", lineNum);
4722 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4723 }
4724
4725 delete testMat;
4726 delete testPat;
4727 }
4728
4729 //
4730 // All done. Clean up allocated stuff.
4731 //
4732 delete cgMat;
4733 delete cgPat;
4734
4735 delete groupsMat;
4736 delete groupsPat;
4737
4738 delete flagMat;
4739 delete flagPat;
4740
4741 delete lineMat;
4742 delete linePat;
4743
4744 delete fieldPat;
4745 delete [] testData;
4746
4747 utext_close(&patternText);
4748 utext_close(&inputText);
4749
4750 delete [] patternChars;
4751 delete [] inputChars;
4752
4753
4754 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4755
4756}
4757
4758
4759//--------------------------------------------------------------
4760//
4761// Bug6149 Verify limits to heap expansion for backtrack stack.
4762// Use this pattern,
4763// "(a?){1,8000000}"
4764// Note: the upper bound was originally unbounded, but that form now has loop-breaking enabled.
4765// This test is likely to be fragile, as further optimizations stop
4766// more cases of pointless looping in the match engine.
4767//
4768//---------------------------------------------------------------
4769void RegexTest::Bug6149() {
4770 UnicodeString pattern("(a?){1,8000000}");
4771 UnicodeString s("xyz");
4772 uint32_t flags = 0;
4773 UErrorCode status = U_ZERO_ERROR;
4774
4775 RegexMatcher matcher(pattern, s, flags, status);
4776 UBool result = false;
4777 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4778 REGEX_ASSERT(result == FALSE);
4779 }
4780
4781
4782//
4783// Callbacks() Test the callback function.
4784// When set, callbacks occur periodically during matching operations,
4785// giving the application code the ability to abort the operation
4786// before its normal completion.
4787//
4788
4789struct callBackContext {
4790 RegexTest *test;
4791 int32_t maxCalls;
4792 int32_t numCalls;
4793 int32_t lastSteps;
4794 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
4795};
4796
4797U_CDECL_BEGIN
4798static UBool U_CALLCONV
4799testCallBackFn(const void *context, int32_t steps) {
4800 callBackContext *info = (callBackContext *)context;
4801 if (info->lastSteps+1 != steps) {
4802 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4803 }
4804 info->lastSteps = steps;
4805 info->numCalls++;
4806 return (info->numCalls < info->maxCalls);
4807}
4808U_CDECL_END
4809
4810void RegexTest::Callbacks() {
4811 {
4812 // Getter returns NULLs if no callback has been set
4813
4814 // The variables that the getter will fill in.
4815 // Init to non-null values so that the action of the getter can be seen.
4816 const void *returnedContext = &returnedContext;
4817 URegexMatchCallback *returnedFn = &testCallBackFn;
4818
4819 UErrorCode status = U_ZERO_ERROR;
4820 RegexMatcher matcher("x", 0, status);
4821 REGEX_CHECK_STATUS;
4822 matcher.getMatchCallback(returnedFn, returnedContext, status);
4823 REGEX_CHECK_STATUS;
4824 REGEX_ASSERT(returnedFn == NULL);
4825 REGEX_ASSERT(returnedContext == NULL);
4826 }
4827
4828 {
4829 // Set and Get work
4830 callBackContext cbInfo = {this, 0, 0, 0};
4831 const void *returnedContext;
4832 URegexMatchCallback *returnedFn;
4833 UErrorCode status = U_ZERO_ERROR;
4834 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4835 REGEX_CHECK_STATUS;
4836 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4837 REGEX_CHECK_STATUS;
4838 matcher.getMatchCallback(returnedFn, returnedContext, status);
4839 REGEX_CHECK_STATUS;
4840 REGEX_ASSERT(returnedFn == testCallBackFn);
4841 REGEX_ASSERT(returnedContext == &cbInfo);
4842
4843 // A short-running match shouldn't invoke the callback
4844 status = U_ZERO_ERROR;
4845 cbInfo.reset(1);
4846 UnicodeString s = "xxx";
4847 matcher.reset(s);
4848 REGEX_ASSERT(matcher.matches(status));
4849 REGEX_CHECK_STATUS;
4850 REGEX_ASSERT(cbInfo.numCalls == 0);
4851
4852 // A medium-length match that runs long enough to invoke the
4853 // callback, but not so long that the callback aborts it.
4854 status = U_ZERO_ERROR;
4855 cbInfo.reset(4);
4856 s = "aaaaaaaaaaaaaaaaaaab";
4857 matcher.reset(s);
4858 REGEX_ASSERT(matcher.matches(status)==FALSE);
4859 REGEX_CHECK_STATUS;
4860 REGEX_ASSERT(cbInfo.numCalls > 0);
4861
4862 // A longer running match that the callback function will abort.
4863 status = U_ZERO_ERROR;
4864 cbInfo.reset(4);
4865 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4866 matcher.reset(s);
4867 REGEX_ASSERT(matcher.matches(status)==FALSE);
4868 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4869 REGEX_ASSERT(cbInfo.numCalls == 4);
4870
4871 // A longer running find that the callback function will abort.
4872 status = U_ZERO_ERROR;
4873 cbInfo.reset(4);
4874 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4875 matcher.reset(s);
4876 REGEX_ASSERT(matcher.find(status)==FALSE);
4877 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4878 REGEX_ASSERT(cbInfo.numCalls == 4);
4879 }
4880
4881
4882}
4883
4884
4885//
4886// FindProgressCallbacks() Test the find "progress" callback function.
4887// When set, the find progress callback will be invoked during find operations
4888// after each return from a match attempt, giving the application the opportunity
4889// to terminate a long-running find operation before its normal completion.
4890//
4891
4892struct progressCallBackContext {
4893 RegexTest *test;
4894 int64_t lastIndex;
4895 int32_t maxCalls;
4896 int32_t numCalls;
4897 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
4898};
4899
4900// call-back function for find().
4901// Return TRUE to continue the find().
4902// Return FALSE to stop the find().
4903U_CDECL_BEGIN
4904static UBool U_CALLCONV
4905testProgressCallBackFn(const void *context, int64_t matchIndex) {
4906 progressCallBackContext *info = (progressCallBackContext *)context;
4907 info->numCalls++;
4908 info->lastIndex = matchIndex;
4909// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4910 return (info->numCalls < info->maxCalls);
4911}
4912U_CDECL_END
4913
4914void RegexTest::FindProgressCallbacks() {
4915 {
4916 // Getter returns NULLs if no callback has been set
4917
4918 // The variables that the getter will fill in.
4919 // Init to non-null values so that the action of the getter can be seen.
4920 const void *returnedContext = &returnedContext;
4921 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4922
4923 UErrorCode status = U_ZERO_ERROR;
4924 RegexMatcher matcher("x", 0, status);
4925 REGEX_CHECK_STATUS;
4926 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4927 REGEX_CHECK_STATUS;
4928 REGEX_ASSERT(returnedFn == NULL);
4929 REGEX_ASSERT(returnedContext == NULL);
4930 }
4931
4932 {
4933 // Set and Get work
4934 progressCallBackContext cbInfo = {this, 0, 0, 0};
4935 const void *returnedContext;
4936 URegexFindProgressCallback *returnedFn;
4937 UErrorCode status = U_ZERO_ERROR;
4938 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4939 REGEX_CHECK_STATUS;
4940 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4941 REGEX_CHECK_STATUS;
4942 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4943 REGEX_CHECK_STATUS;
4944 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4945 REGEX_ASSERT(returnedContext == &cbInfo);
4946
4947 // A find that matches on the initial position does NOT invoke the callback.
4948 status = U_ZERO_ERROR;
4949 cbInfo.reset(100);
4950 UnicodeString s = "aaxxx";
4951 matcher.reset(s);
4952#if 0
4953 matcher.setTrace(TRUE);
4954#endif
4955 REGEX_ASSERT(matcher.find(0, status));
4956 REGEX_CHECK_STATUS;
4957 REGEX_ASSERT(cbInfo.numCalls == 0);
4958
4959 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4960 // but not so many times that we interrupt the operation.
4961 status = U_ZERO_ERROR;
4962 s = "aaaaaaaaaaaaaaaaaaab";
4963 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4964 matcher.reset(s);
4965 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4966 REGEX_CHECK_STATUS;
4967 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4968
4969 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4970 status = U_ZERO_ERROR;
4971 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4972 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4973 matcher.reset(s1);
4974 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4975 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4976 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4977
4978 // Now a match that will succeed, but after an interruption
4979 status = U_ZERO_ERROR;
4980 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4981 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4982 matcher.reset(s2);
4983 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4984 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4985 // Now retry the match from where left off
4986 cbInfo.maxCalls = 100; // No callback limit
4987 status = U_ZERO_ERROR;
4988 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4989 REGEX_CHECK_STATUS;
4990 }
4991
4992
4993}
4994
4995
4996//---------------------------------------------------------------------------
4997//
4998// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4999// UTexts. The pure-C implementation of UText
5000// has no mutable backing stores, but we can
5001// use UnicodeString here to test the functionality.
5002//
5003//---------------------------------------------------------------------------
5004void RegexTest::PreAllocatedUTextCAPI () {
5005 UErrorCode status = U_ZERO_ERROR;
5006 URegularExpression *re;
5007 UText patternText = UTEXT_INITIALIZER;
5008 UnicodeString buffer;
5009 UText bufferText = UTEXT_INITIALIZER;
5010
5011 utext_openUnicodeString(&bufferText, &buffer, &status);
5012
5013 /*
5014 * getText() and getUText()
5015 */
5016 {
5017 UText text1 = UTEXT_INITIALIZER;
5018 UText text2 = UTEXT_INITIALIZER;
5019 UChar text2Chars[20];
5020 UText *resultText;
5021
5022 status = U_ZERO_ERROR;
5023 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5024 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5025 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5026 utext_openUChars(&text2, text2Chars, -1, &status);
5027
5028 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5029 re = uregex_openUText(&patternText, 0, NULL, &status);
5030
5031 /* First set a UText */
5032 uregex_setUText(re, &text1, &status);
5033 resultText = uregex_getUText(re, &bufferText, &status);
5034 REGEX_CHECK_STATUS;
5035 REGEX_ASSERT(resultText == &bufferText);
5036 utext_setNativeIndex(resultText, 0);
5037 utext_setNativeIndex(&text1, 0);
5038 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5039
5040 resultText = uregex_getUText(re, &bufferText, &status);
5041 REGEX_CHECK_STATUS;
5042 REGEX_ASSERT(resultText == &bufferText);
5043 utext_setNativeIndex(resultText, 0);
5044 utext_setNativeIndex(&text1, 0);
5045 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5046
5047 /* Then set a UChar * */
5048 uregex_setText(re, text2Chars, 7, &status);
5049 resultText = uregex_getUText(re, &bufferText, &status);
5050 REGEX_CHECK_STATUS;
5051 REGEX_ASSERT(resultText == &bufferText);
5052 utext_setNativeIndex(resultText, 0);
5053 utext_setNativeIndex(&text2, 0);
5054 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5055
5056 uregex_close(re);
5057 utext_close(&text1);
5058 utext_close(&text2);
5059 }
5060
5061 /*
5062 * group()
5063 */
5064 {
5065 UChar text1[80];
5066 UText *actual;
5067 UBool result;
5068 int64_t length = 0;
5069
5070 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5071 // 012345678901234567890123456789012345678901234567
5072 // 0 1 2 3 4
5073
5074 status = U_ZERO_ERROR;
5075 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5076 REGEX_CHECK_STATUS;
5077
5078 uregex_setText(re, text1, -1, &status);
5079 result = uregex_find(re, 0, &status);
5080 REGEX_ASSERT(result==TRUE);
5081
5082 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5083 status = U_ZERO_ERROR;
5084 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5085 REGEX_CHECK_STATUS;
5086 REGEX_ASSERT(actual == &bufferText);
5087 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5088 REGEX_ASSERT(length == 16);
5089 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5090
5091 /* Capture group #1. Should succeed, matching " interior ". */
5092 status = U_ZERO_ERROR;
5093 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5094 REGEX_CHECK_STATUS;
5095 REGEX_ASSERT(actual == &bufferText);
5096 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5097 REGEX_ASSERT(length == 10);
5098 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5099
5100 /* Capture group out of range. Error. */
5101 status = U_ZERO_ERROR;
5102 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5103 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5104 REGEX_ASSERT(actual == &bufferText);
5105 uregex_close(re);
5106
5107 }
5108
5109 /*
5110 * replaceFirst()
5111 */
5112 {
5113 UChar text1[80];
5114 UChar text2[80];
5115 UText replText = UTEXT_INITIALIZER;
5116 UText *result;
5117 status = U_ZERO_ERROR;
5118 utext_openUnicodeString(&bufferText, &buffer, &status);
5119
5120 status = U_ZERO_ERROR;
5121 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5122 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5123 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5124
5125 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5126 REGEX_CHECK_STATUS;
5127
5128 /* Normal case, with match */
5129 uregex_setText(re, text1, -1, &status);
5130 REGEX_CHECK_STATUS;
5131 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5132 REGEX_CHECK_STATUS;
5133 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5134 REGEX_CHECK_STATUS;
5135 REGEX_ASSERT(result == &bufferText);
5136 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5137
5138 /* No match. Text should copy to output with no changes. */
5139 uregex_setText(re, text2, -1, &status);
5140 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5141 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5142 REGEX_CHECK_STATUS;
5143 REGEX_ASSERT(result == &bufferText);
5144 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5145
5146 /* Unicode escapes */
5147 uregex_setText(re, text1, -1, &status);
5148 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5149 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5151 REGEX_CHECK_STATUS;
5152 REGEX_ASSERT(result == &bufferText);
5153 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5154
5155 uregex_close(re);
5156 utext_close(&replText);
5157 }
5158
5159
5160 /*
5161 * replaceAll()
5162 */
5163 {
5164 UChar text1[80];
5165 UChar text2[80];
5166 UText replText = UTEXT_INITIALIZER;
5167 UText *result;
5168
5169 status = U_ZERO_ERROR;
5170 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5171 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5172 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5173
5174 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5175 REGEX_CHECK_STATUS;
5176
5177 /* Normal case, with match */
5178 uregex_setText(re, text1, -1, &status);
5179 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5180 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5181 REGEX_CHECK_STATUS;
5182 REGEX_ASSERT(result == &bufferText);
5183 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5184
5185 /* No match. Text should copy to output with no changes. */
5186 uregex_setText(re, text2, -1, &status);
5187 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5188 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5189 REGEX_CHECK_STATUS;
5190 REGEX_ASSERT(result == &bufferText);
5191 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5192
5193 uregex_close(re);
5194 utext_close(&replText);
5195 }
5196
5197
5198 /*
5199 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5200 * so we don't need to test it here.
5201 */
5202
5203 utext_close(&bufferText);
5204 utext_close(&patternText);
5205}
5206
5207
5208//--------------------------------------------------------------
5209//
5210// NamedCapture Check basic named capture group functionality
5211//
5212//--------------------------------------------------------------
5213void RegexTest::NamedCapture() {
5214 UErrorCode status = U_ZERO_ERROR;
5215 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5216 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5217 REGEX_CHECK_STATUS;
5218 int32_t group = pat->groupNumberFromName("five", -1, status);
5219 REGEX_CHECK_STATUS;
5220 REGEX_ASSERT(5 == group);
5221 group = pat->groupNumberFromName("three", -1, status);
5222 REGEX_CHECK_STATUS;
5223 REGEX_ASSERT(3 == group);
5224
5225 status = U_ZERO_ERROR;
5226 group = pat->groupNumberFromName(UnicodeString("six"), status);
5227 REGEX_CHECK_STATUS;
5228 REGEX_ASSERT(6 == group);
5229
5230 status = U_ZERO_ERROR;
5231 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5232 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5233
5234 status = U_ZERO_ERROR;
5235
5236 // After copying a pattern, named capture should still work in the copy.
5237 RegexPattern *copiedPat = new RegexPattern(*pat);
5238 REGEX_ASSERT(*copiedPat == *pat);
5239 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5240
5241 group = copiedPat->groupNumberFromName("five", -1, status);
5242 REGEX_CHECK_STATUS;
5243 REGEX_ASSERT(5 == group);
5244 group = copiedPat->groupNumberFromName("three", -1, status);
5245 REGEX_CHECK_STATUS;
5246 REGEX_ASSERT(3 == group);
5247 delete copiedPat;
5248
5249 // ReplaceAll with named capture group.
5250 status = U_ZERO_ERROR;
5251 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5252 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5253 REGEX_CHECK_STATUS;
5254 // m.pattern().dumpPattern();
5255 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5256 REGEX_CHECK_STATUS;
5257 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5258 delete m;
5259
5260 // ReplaceAll, allowed capture group numbers.
5261 text = UnicodeString("abcmxyz");
5262 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5263 REGEX_CHECK_STATUS;
5264
5265 status = U_ZERO_ERROR;
5266 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5267 REGEX_CHECK_STATUS;
5268 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5269
5270 status = U_ZERO_ERROR;
5271 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5272 REGEX_CHECK_STATUS;
5273 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5274
5275 status = U_ZERO_ERROR;
5276 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5277 REGEX_CHECK_STATUS;
5278 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5279
5280 status = U_ZERO_ERROR;
5281 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5282 REGEX_CHECK_STATUS;
5283 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5284
5285 status = U_ZERO_ERROR;
5286 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5287 REGEX_CHECK_STATUS;
5288 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5289
5290 status = U_ZERO_ERROR;
5291 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5292 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5293
5294 status = U_ZERO_ERROR;
5295 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5296 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5297 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5298
5299 status = U_ZERO_ERROR;
5300 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5301 REGEX_CHECK_STATUS; // that push group num out of range.
5302 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5303
5304 status = U_ZERO_ERROR;
5305 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5306 REGEX_CHECK_STATUS;
5307 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5308
5309 status = U_ZERO_ERROR;
5310 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5311 REGEX_CHECK_STATUS;
5312 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5313
5314 status = U_ZERO_ERROR;
5315 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5316 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5317
5318 status = U_ZERO_ERROR;
5319 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5320 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5321
5322 status = U_ZERO_ERROR;
5323 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5324 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5325
5326 status = U_ZERO_ERROR;
5327 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5328 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5329
5330 delete m;
5331
5332 // Repeat the above replaceAll() tests using the plain C API, which
5333 // has a separate implementation internally.
5334 // TODO: factor out the test data.
5335
5336 status = U_ZERO_ERROR;
5337 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5338 REGEX_CHECK_STATUS;
5339 text = UnicodeString("abcmxyz");
5340 uregex_setText(re, text.getBuffer(), text.length(), &status);
5341 REGEX_CHECK_STATUS;
5342
5343 UChar resultBuf[100];
5344 int32_t resultLength;
5345 UnicodeString repl;
5346
5347 status = U_ZERO_ERROR;
5348 repl = UnicodeString("<$0>");
5349 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5350 REGEX_CHECK_STATUS;
5351 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5352
5353 status = U_ZERO_ERROR;
5354 repl = UnicodeString("<$1>");
5355 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5356 REGEX_CHECK_STATUS;
5357 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5358
5359 status = U_ZERO_ERROR;
5360 repl = UnicodeString("<${one}>");
5361 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5362 REGEX_CHECK_STATUS;
5363 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5364
5365 status = U_ZERO_ERROR;
5366 repl = UnicodeString("<$2>");
5367 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5368 REGEX_CHECK_STATUS;
5369 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5370
5371 status = U_ZERO_ERROR;
5372 repl = UnicodeString("<$3>");
5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374 REGEX_CHECK_STATUS;
5375 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5376
5377 status = U_ZERO_ERROR;
5378 repl = UnicodeString("<$4>");
5379 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5381
5382 status = U_ZERO_ERROR;
5383 repl = UnicodeString("<$04>");
5384 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5385 REGEX_CHECK_STATUS;
5386 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5387
5388 status = U_ZERO_ERROR;
5389 repl = UnicodeString("<$000016>");
5390 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5391 REGEX_CHECK_STATUS;
5392 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5393
5394 status = U_ZERO_ERROR;
5395 repl = UnicodeString("<$3$2$1${one}>");
5396 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5397 REGEX_CHECK_STATUS;
5398 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5399
5400 status = U_ZERO_ERROR;
5401 repl = UnicodeString("$3$2$1${one}");
5402 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5403 REGEX_CHECK_STATUS;
5404 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5405
5406 status = U_ZERO_ERROR;
5407 repl = UnicodeString("<${noSuchName}>");
5408 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5410
5411 status = U_ZERO_ERROR;
5412 repl = UnicodeString("<${invalid-name}>");
5413 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5414 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5415
5416 status = U_ZERO_ERROR;
5417 repl = UnicodeString("<${one");
5418 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5419 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5420
5421 status = U_ZERO_ERROR;
5422 repl = UnicodeString("$not a capture group");
5423 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5424 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5425
5426 uregex_close(re);
5427}
5428
5429//--------------------------------------------------------------
5430//
5431// NamedCaptureLimits Patterns with huge numbers of named capture groups.
5432// The point is not so much what the exact limit is,
5433// but that a largish number doesn't hit bad non-linear performance,
5434// and that exceeding the limit fails cleanly.
5435//
5436//--------------------------------------------------------------
5437void RegexTest::NamedCaptureLimits() {
5438 if (quick) {
5439 logln("Skipping test. Runs in exhuastive mode only.");
5440 return;
5441 }
5442 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5443 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5444 char nnbuf[100];
5445 UnicodeString pattern;
5446 int32_t nn;
5447
5448 for (nn=1; nn<goodLimit; nn++) {
5449 sprintf(nnbuf, "(?<nn%d>)", nn);
5450 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5451 }
5452 UErrorCode status = U_ZERO_ERROR;
5453 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5454 REGEX_CHECK_STATUS;
5455 for (nn=1; nn<goodLimit; nn++) {
5456 sprintf(nnbuf, "nn%d", nn);
5457 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5458 REGEX_ASSERT(nn == groupNum);
5459 if (nn != groupNum) {
5460 break;
5461 }
5462 }
5463 delete pat;
5464
5465 pattern.remove();
5466 for (nn=1; nn<failLimit; nn++) {
5467 sprintf(nnbuf, "(?<nn%d>)", nn);
5468 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5469 }
5470 status = U_ZERO_ERROR;
5471 pat = RegexPattern::compile(pattern, 0, status);
5472 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5473 delete pat;
5474}
5475
5476
5477//--------------------------------------------------------------
5478//
5479// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5480//
5481//---------------------------------------------------------------
5482void RegexTest::Bug7651() {
5483 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5484 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5485 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5486 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5487 UnicodeString s("#ff @abcd This is test");
5488 RegexPattern *REPattern = NULL;
5489 RegexMatcher *REMatcher = NULL;
5490 UErrorCode status = U_ZERO_ERROR;
5491 UParseError pe;
5492
5493 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5494 REGEX_CHECK_STATUS;
5495 REMatcher = REPattern->matcher(s, status);
5496 REGEX_CHECK_STATUS;
5497 REGEX_ASSERT(REMatcher->find());
5498 REGEX_ASSERT(REMatcher->start(status) == 0);
5499 delete REPattern;
5500 delete REMatcher;
5501 status = U_ZERO_ERROR;
5502
5503 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5504 REGEX_CHECK_STATUS;
5505 REMatcher = REPattern->matcher(s, status);
5506 REGEX_CHECK_STATUS;
5507 REGEX_ASSERT(REMatcher->find());
5508 REGEX_ASSERT(REMatcher->start(status) == 0);
5509 delete REPattern;
5510 delete REMatcher;
5511 status = U_ZERO_ERROR;
5512 }
5513
5514void RegexTest::Bug7740() {
5515 UErrorCode status = U_ZERO_ERROR;
5516 UnicodeString pattern = "(a)";
5517 UnicodeString text = "abcdef";
5518 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5519 REGEX_CHECK_STATUS;
5520 REGEX_ASSERT(m->lookingAt(status));
5521 REGEX_CHECK_STATUS;
5522 status = U_ILLEGAL_ARGUMENT_ERROR;
5523 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5524 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5525 REGEX_ASSERT(s == "");
5526 delete m;
5527}
5528
5529// Bug 8479: was crashing whith a Bogus UnicodeString as input.
5530
5531void RegexTest::Bug8479() {
5532 UErrorCode status = U_ZERO_ERROR;
5533
5534 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5535 REGEX_CHECK_STATUS;
5536 if (U_SUCCESS(status))
5537 {
5538 UnicodeString str;
5539 str.setToBogus();
5540 pMatcher->reset(str);
5541 status = U_ZERO_ERROR;
5542 pMatcher->matches(status);
5543 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5544 delete pMatcher;
5545 }
5546}
5547
5548
5549// Bug 7029
5550void RegexTest::Bug7029() {
5551 UErrorCode status = U_ZERO_ERROR;
5552
5553 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5554 UnicodeString text = "abc.def";
5555 UnicodeString splits[10];
5556 REGEX_CHECK_STATUS;
5557 int32_t numFields = pMatcher->split(text, splits, 10, status);
5558 REGEX_CHECK_STATUS;
5559 REGEX_ASSERT(numFields == 8);
5560 delete pMatcher;
5561}
5562
5563// Bug 9283
5564// This test is checking for the existance of any supplemental characters that case-fold
5565// to a bmp character.
5566//
5567// At the time of this writing there are none. If any should appear in a subsequent release
5568// of Unicode, the code in regular expressions compilation that determines the longest
5569// posssible match for a literal string will need to be enhanced.
5570//
5571// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5572// for details on what to do in case of a failure of this test.
5573//
5574void RegexTest::Bug9283() {
5575#if !UCONFIG_NO_NORMALIZATION
5576 UErrorCode status = U_ZERO_ERROR;
5577 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5578 REGEX_CHECK_STATUS;
5579 int32_t index;
5580 UChar32 c;
5581 for (index=0; ; index++) {
5582 c = supplementalsWithCaseFolding.charAt(index);
5583 if (c == -1) {
5584 break;
5585 }
5586 UnicodeString cf = UnicodeString(c).foldCase();
5587 REGEX_ASSERT(cf.length() >= 2);
5588 }
5589#endif /* #if !UCONFIG_NO_NORMALIZATION */
5590}
5591
5592
5593void RegexTest::CheckInvBufSize() {
5594 if(inv_next>=INV_BUFSIZ) {
5595 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5596 __FILE__, INV_BUFSIZ, inv_next);
5597 } else {
5598 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5599 }
5600}
5601
5602
5603void RegexTest::Bug10459() {
5604 UErrorCode status = U_ZERO_ERROR;
5605 UnicodeString patternString("(txt)");
5606 UnicodeString txtString("txt");
5607
5608 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5609 REGEX_CHECK_STATUS;
5610 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5611 REGEX_CHECK_STATUS;
5612
5613 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5614 REGEX_CHECK_STATUS;
5615
5616 uregex_setUText(icu_re, utext_txt, &status);
5617 REGEX_CHECK_STATUS;
5618
5619 // The bug was that calling uregex_group() before doing a matching operation
5620 // was causing a segfault. Only for Regular Expressions created from UText.
5621 // It should set an U_REGEX_INVALID_STATE.
5622
5623 UChar buf[100];
5624 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5625 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5626 REGEX_ASSERT(len == 0);
5627
5628 uregex_close(icu_re);
5629 utext_close(utext_pat);
5630 utext_close(utext_txt);
5631}
5632
5633void RegexTest::TestCaseInsensitiveStarters() {
5634 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5635 // become stale because of new Unicode characters.
5636 // If it is stale, rerun the generation tool
5637 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5638 // and replace the embedded data in i18n/regexcmp.cpp
5639
5640 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5641 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5642 continue;
5643 }
5644 UnicodeSet s(cp, cp);
5645 s.closeOver(USET_CASE_INSENSITIVE);
5646 UnicodeSetIterator setIter(s);
5647 while (setIter.next()) {
5648 if (!setIter.isString()) {
5649 continue;
5650 }
5651 const UnicodeString &str = setIter.getString();
5652 UChar32 firstChar = str.char32At(0);
5653 UnicodeSet starters;
5654 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5655 if (!starters.contains(cp)) {
5656 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5657 return;
5658 }
5659 }
5660 }
5661}
5662
5663
5664void RegexTest::TestBug11049() {
5665 // Original bug report: pattern with match start consisting of one of several individual characters,
5666 // and the text being matched ending with a supplementary character. find() would read past the
5667 // end of the input text when searching for potential match starting points.
5668
5669 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5670 // detect the bad read.
5671
5672 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5673 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5674
5675 // Test again with a pattern starting with a single character,
5676 // which takes a different code path than starting with an OR expression,
5677 // but with similar logic.
5678 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5679 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5680}
5681
5682// Run a single test case from TestBug11049(). Internal function.
5683void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5684 UErrorCode status = U_ZERO_ERROR;
5685 UnicodeString patternString = UnicodeString(pattern).unescape();
5686 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5687
5688 UnicodeString dataString = UnicodeString(data).unescape();
5689 UChar *exactBuffer = new UChar[dataString.length()];
5690 dataString.extract(exactBuffer, dataString.length(), status);
5691 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5692
5693 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5694 REGEX_CHECK_STATUS;
5695 matcher->reset(ut);
5696 UBool result = matcher->find();
5697 if (result != expectMatch) {
5698 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5699 __FILE__, lineNumber, expectMatch, result, pattern, data);
5700 }
5701
5702 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5703 // off-by-one on find() with match at the last code point.
5704 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5705 // because string.unescape() will only shrink it.
5706 char * utf8Buffer = new char[uprv_strlen(data)+1];
5707 u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5708 REGEX_CHECK_STATUS;
5709 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5710 REGEX_CHECK_STATUS;
5711 matcher->reset(ut);
5712 result = matcher->find();
5713 if (result != expectMatch) {
5714 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5715 __FILE__, lineNumber, expectMatch, result, pattern, data);
5716 }
5717 delete [] utf8Buffer;
5718
5719 utext_close(ut);
5720 delete [] exactBuffer;
5721}
5722
5723
5724void RegexTest::TestBug11371() {
5725 if (quick) {
5726 logln("Skipping test. Runs in exhuastive mode only.");
5727 return;
5728 }
5729 UErrorCode status = U_ZERO_ERROR;
5730 UnicodeString patternString;
5731
5732 for (int i=0; i<8000000; i++) {
5733 patternString.append(UnicodeString("()"));
5734 }
5735 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5736 if (status != U_REGEX_PATTERN_TOO_BIG) {
5737 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5738 __FILE__, __LINE__, u_errorName(status));
5739 }
5740
5741 status = U_ZERO_ERROR;
5742 patternString = "(";
5743 for (int i=0; i<20000000; i++) {
5744 patternString.append(UnicodeString("A++"));
5745 }
5746 patternString.append(UnicodeString("){0}B++"));
5747 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5748 if (status != U_REGEX_PATTERN_TOO_BIG) {
5749 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750 __FILE__, __LINE__, u_errorName(status));
5751 }
5752
5753 // Pattern with too much string data, such that string indexes overflow operand data field size
5754 // in compiled instruction.
5755 status = U_ZERO_ERROR;
5756 patternString = "";
5757 while (patternString.length() < 0x00ffffff) {
5758 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5759 }
5760 patternString.append(UnicodeString("X? trailing string"));
5761 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5762 if (status != U_REGEX_PATTERN_TOO_BIG) {
5763 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5764 __FILE__, __LINE__, u_errorName(status));
5765 }
5766}
5767
5768void RegexTest::TestBug11480() {
5769 // C API, get capture group of a group that does not participate in the match.
5770 // (Returns a zero length string, with nul termination,
5771 // indistinguishable from a group with a zero length match.)
5772
5773 UErrorCode status = U_ZERO_ERROR;
5774 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5775 REGEX_CHECK_STATUS;
5776 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5777 uregex_setText(re, text.getBuffer(), text.length(), &status);
5778 REGEX_CHECK_STATUS;
5779 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5780 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5781 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5782 REGEX_ASSERT(length == 0);
5783 REGEX_ASSERT(buf[0] == 13);
5784 REGEX_ASSERT(buf[1] == 0);
5785 REGEX_ASSERT(buf[2] == 13);
5786 uregex_close(re);
5787
5788 // UText C++ API, length of match is 0 for non-participating matches.
5789 UText ut = UTEXT_INITIALIZER;
5790 utext_openUnicodeString(&ut, &text, &status);
5791 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5792 REGEX_CHECK_STATUS;
5793 matcher.reset(&ut);
5794 REGEX_ASSERT(matcher.lookingAt(0, status));
5795
5796 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5797 int64_t groupLen = -666;
5798 UText group = UTEXT_INITIALIZER;
5799 matcher.group(1, &group, groupLen, status);
5800 REGEX_CHECK_STATUS;
5801 REGEX_ASSERT(groupLen == 1);
5802 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5803
5804 // Capture group 2, the (B), does not participate in the match.
5805 matcher.group(2, &group, groupLen, status);
5806 REGEX_CHECK_STATUS;
5807 REGEX_ASSERT(groupLen == 0);
5808 REGEX_ASSERT(matcher.start(2, status) == -1);
5809 REGEX_CHECK_STATUS;
5810}
5811
5812void RegexTest::TestBug12884() {
5813 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5814 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5815 UnicodeString text(u"hello");
5816 UErrorCode status = U_ZERO_ERROR;
5817 RegexMatcher m(pattern, text, 0, status);
5818 REGEX_CHECK_STATUS;
5819 m.setTimeLimit(5, status);
5820 m.find(status);
5821 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5822
5823 // Non-greedy loops. They take a different code path during matching.
5824 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5825 status = U_ZERO_ERROR;
5826 RegexMatcher ngM(ngPattern, text, 0, status);
5827 REGEX_CHECK_STATUS;
5828 ngM.setTimeLimit(5, status);
5829 ngM.find(status);
5830 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5831
5832 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5833 const char *text8 = reinterpret_cast<const char*>(u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5834 "carácter, sin importar la plataforma, sin importar el programa,"
5835 "sin importar el idioma.");
5836 status = U_ZERO_ERROR;
5837 LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5838 REGEX_CHECK_STATUS;
5839 m.reset(ut.getAlias());
5840 m.find(status);
5841 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5842
5843 status = U_ZERO_ERROR;
5844 ngM.reset(ut.getAlias());
5845 ngM.find(status);
5846 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5847}
5848
5849// Bug 13631. A find() of a pattern with a zero length look-behind assertions
5850// can cause a read past the end of the input text.
5851// The failure is seen when running this test with Clang's Addresss Sanitizer.
5852
5853void RegexTest::TestBug13631() {
5854 const UChar *pats[] = { u"(?<!^)",
5855 u"(?<=^)",
5856 nullptr
5857 };
5858 for (const UChar **pat=pats; *pat; ++pat) {
5859 UErrorCode status = U_ZERO_ERROR;
5860 UnicodeString upat(*pat);
5861 RegexMatcher matcher(upat, 0, status);
5862 const UChar s =u'a';
5863 UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5864 REGEX_CHECK_STATUS;
5865 matcher.reset(ut);
5866 while (matcher.find()) {
5867 }
5868 utext_close(ut);
5869 }
5870}
5871
5872// Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5873// where a following group specification would be expected.
5874// Failure shows when running the test under Clang's Address Sanitizer.
5875
5876void RegexTest::TestBug13632() {
5877 UErrorCode status = U_ZERO_ERROR;
5878 URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5879 const char16_t *sourceString = u"Hello, world.";
5880 uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5881
5882 const int32_t destCap = 20;
5883 char16_t dest[destCap] = {};
5884 const char16_t replacement[] = {u'x', u'$'}; // Not nul terminated string.
5885 uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5886
5887 assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5888 uregex_close(re);
5889}
5890
5891void RegexTest::TestBug20359() {
5892 // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5893 // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5894 // Changed implementation to loop instead of recursing.
5895
5896 UnicodeString pattern;
5897 for (int i=0; i<50000; ++i) {
5898 pattern += u"\\Q\\E";
5899 }
5900 pattern += u"x";
5901
5902 UErrorCode status = U_ZERO_ERROR;
5903 LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5904 0, nullptr, &status));
5905 assertSuccess(WHERE, status);
5906
5907 // We have passed the point where the bug crashed. The following is a small sanity
5908 // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5909
5910 uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5911 assertSuccess(WHERE, status);
5912 assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5913 assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5914 assertSuccess(WHERE, status);
5915}
5916
5917
5918void RegexTest::TestBug20863() {
5919 // Test that patterns with a large number of named capture groups work correctly.
5920 //
5921 // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5922 // construction of the map from capture names to numbers, and decreasing the
5923 // default size of the map.
5924
5925 constexpr int GROUP_COUNT = 2000;
5926 std::vector<UnicodeString> groupNames;
5927 for (int32_t i=0; i<GROUP_COUNT; ++i) {
5928 UnicodeString name;
5929 name.append(u"name");
5930 name.append(Int64ToUnicodeString(i));
5931 groupNames.push_back(name);
5932 }
5933
5934 UnicodeString patternString;
5935 for (UnicodeString name: groupNames) {
5936 patternString.append(u"(?<");
5937 patternString.append(name);
5938 patternString.append(u">.)");
5939 }
5940
5941 UErrorCode status = U_ZERO_ERROR;
5942 UParseError pe;
5943 LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
5944 if (!assertSuccess(WHERE, status)) {
5945 return;
5946 }
5947
5948 for (int32_t i=0; i<GROUP_COUNT; ++i) {
5949 int32_t group = pattern->groupNumberFromName(groupNames[i], status);
5950 if (!assertSuccess(WHERE, status)) {
5951 return;
5952 }
5953 assertEquals(WHERE, i+1, group);
5954 // Note: group 0 is the overall match; group 1 is the first separate capture group.
5955 }
5956
5957 // Verify that assignment of patterns with various combinations of named capture work.
5958 // Lazy creation of the internal named capture map changed the implementation logic here.
5959 {
5960 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5961 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5962 assertSuccess(WHERE, status);
5963 assertFalse(WHERE, *pat1 == *pat2);
5964 *pat1 = *pat2;
5965 assertTrue(WHERE, *pat1 == *pat2);
5966 assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
5967 assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
5968 assertSuccess(WHERE, status);
5969 }
5970
5971 {
5972 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5973 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5974 assertSuccess(WHERE, status);
5975 assertFalse(WHERE, *pat1 == *pat2);
5976 *pat2 = *pat1;
5977 assertTrue(WHERE, *pat1 == *pat2);
5978 assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
5979 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5980 status = U_ZERO_ERROR;
5981 assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
5982 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5983 status = U_ZERO_ERROR;
5984 }
5985
5986 {
5987 LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
5988 LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
5989 assertSuccess(WHERE, status);
5990 assertFalse(WHERE, *pat1 == *pat2);
5991 *pat2 = *pat1;
5992 assertTrue(WHERE, *pat1 == *pat2);
5993 assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
5994 assertSuccess(WHERE, status);
5995 assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
5996 assertSuccess(WHERE, status);
5997 assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
5998 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5999 status = U_ZERO_ERROR;
6000 assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
6001 assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
6002 status = U_ZERO_ERROR;
6003 }
6004
6005}
6006
6007
6008#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */