]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/regextst.cpp
ICU-59117.0.1.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
CommitLineData
f3c0d7a5
A
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
b75a7d8f
A
3/********************************************************************
4 * COPYRIGHT:
2ca993e8 5 * Copyright (c) 2002-2016, International Business Machines Corporation and
b75a7d8f
A
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9//
10// regextst.cpp
11//
12// ICU Regular Expressions test, part of intltest.
13//
14
4388f060
A
15/*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
374ca955 25#include "intltest.h"
b75a7d8f
A
26#if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
2ca993e8
A
28#include <stdlib.h>
29#include <stdio.h>
30#include <string.h>
31
b331163b 32#include "unicode/localpointer.h"
374ca955 33#include "unicode/regex.h"
b75a7d8f
A
34#include "unicode/uchar.h"
35#include "unicode/ucnv.h"
4388f060 36#include "unicode/uniset.h"
57a6839d 37#include "unicode/uregex.h"
b331163b 38#include "unicode/usetiter.h"
729e4ab9 39#include "unicode/ustring.h"
2ca993e8
A
40#include "unicode/utext.h"
41
b75a7d8f 42#include "regextst.h"
b331163b 43#include "regexcmp.h"
b75a7d8f 44#include "uvector.h"
b75a7d8f 45#include "util.h"
b331163b 46#include "cmemory.h"
729e4ab9
A
47#include "cstring.h"
48#include "uinvchar.h"
b75a7d8f 49
729e4ab9 50#define SUPPORT_MUTATING_INPUT_STRING 0
b75a7d8f
A
51
52//---------------------------------------------------------------------------
53//
54// Test class boilerplate
55//
56//---------------------------------------------------------------------------
374ca955 57RegexTest::RegexTest()
b75a7d8f 58{
73c04bcf 59}
b75a7d8f
A
60
61
62RegexTest::~RegexTest()
63{
73c04bcf 64}
b75a7d8f
A
65
66
67
68void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
69{
70 if (exec) logln("TestSuite RegexTest: ");
f3c0d7a5
A
71 TESTCASE_AUTO_BEGIN;
72 TESTCASE_AUTO(Basic);
73 TESTCASE_AUTO(API_Match);
74 TESTCASE_AUTO(API_Replace);
75 TESTCASE_AUTO(API_Pattern);
729e4ab9 76#if !UCONFIG_NO_FILE_IO
f3c0d7a5 77 TESTCASE_AUTO(Extended);
729e4ab9 78#endif
f3c0d7a5
A
79 TESTCASE_AUTO(Errors);
80 TESTCASE_AUTO(PerlTests);
81 TESTCASE_AUTO(Callbacks);
82 TESTCASE_AUTO(FindProgressCallbacks);
83 TESTCASE_AUTO(Bug6149);
84 TESTCASE_AUTO(UTextBasic);
85 TESTCASE_AUTO(API_Match_UTF8);
86 TESTCASE_AUTO(API_Replace_UTF8);
87 TESTCASE_AUTO(API_Pattern_UTF8);
88 TESTCASE_AUTO(PerlTestsUTF8);
89 TESTCASE_AUTO(PreAllocatedUTextCAPI);
90 TESTCASE_AUTO(Bug7651);
91 TESTCASE_AUTO(Bug7740);
92 TESTCASE_AUTO(Bug8479);
93 TESTCASE_AUTO(Bug7029);
94 TESTCASE_AUTO(CheckInvBufSize);
95 TESTCASE_AUTO(Bug9283);
96 TESTCASE_AUTO(Bug10459);
97 TESTCASE_AUTO(TestCaseInsensitiveStarters);
98 TESTCASE_AUTO(TestBug11049);
99 TESTCASE_AUTO(TestBug11371);
100 TESTCASE_AUTO(TestBug11480);
101 TESTCASE_AUTO(NamedCapture);
102 TESTCASE_AUTO(NamedCaptureLimits);
103 TESTCASE_AUTO(TestBug12884);
104 TESTCASE_AUTO_END;
b75a7d8f
A
105}
106
107
729e4ab9
A
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 * @see utext_openUTF8
 */
113static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
114
b75a7d8f
A
115//---------------------------------------------------------------------------
116//
117// Error Checking / Reporting macros used in all of the tests.
118//
119//---------------------------------------------------------------------------
b75a7d8f 120
729e4ab9
A
121static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
122 int64_t oldIndex = utext_getNativeIndex(text);
123 utext_setNativeIndex(text, 0);
124 char *bufPtr = buf;
125 UChar32 c = utext_next32From(text, 0);
126 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
127 if (0x000020<=c && c<0x00007e) {
128 *bufPtr = c;
129 } else {
130#if 0
131 sprintf(bufPtr,"U+%04X", c);
132 bufPtr+= strlen(bufPtr)-1;
133#else
134 *bufPtr = '%';
135#endif
136 }
137 bufPtr++;
138 c = UTEXT_NEXT32(text);
139 }
140 *bufPtr = 0;
141#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
142 char *ebuf = (char*)malloc(bufLen);
143 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
144 uprv_strncpy(buf, ebuf, bufLen);
145 free((void*)ebuf);
146#endif
147 utext_setNativeIndex(text, oldIndex);
148}
149
4388f060
A
150
151static char ASSERT_BUF[1024];
152
153const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
154 if(message.length()==0) {
155 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
156 } else {
157 UnicodeString buf;
158 IntlTest::prettify(message,buf);
159 if(buf.length()==0) {
160 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
161 } else {
162 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
163 if(ASSERT_BUF[0]==0) {
164 ASSERT_BUF[0]=0;
165 for(int32_t i=0;i<buf.length();i++) {
166 UChar ch = buf[i];
167 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
168 }
169 }
170 }
171 }
172 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
173 return ASSERT_BUF;
174}
175
// Logs a printable rendering (at most 199 chars) of a UText, tagged with the
// source location and the spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the local variable `status` holds a failure, reports it and RETURNS from
// the enclosing (void) function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure. status=%s", \
    __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure when expr evaluates to FALSE.  Does not return.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr with a fresh local `status` (shadowing any outer one) and
// reports a failure unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number
// and does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports a caller-supplied line number and
// RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
4388f060
A
199
200
201static UBool testUTextEqual(UText *uta, UText *utb) {
202 UChar32 ca = 0;
203 UChar32 cb = 0;
204 utext_setNativeIndex(uta, 0);
205 utext_setNativeIndex(utb, 0);
206 do {
207 ca = utext_next32(uta);
208 cb = utext_next32(utb);
209 if (ca != cb) {
210 break;
211 }
212 } while (ca != U_SENTINEL);
213 return ca == cb;
214}
215
216
729e4ab9
A
217/**
218 * @param expected expected text in UTF-8 (not platform) codepage
219 */
220void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
221 UErrorCode status = U_ZERO_ERROR;
222 UText expectedText = UTEXT_INITIALIZER;
223 utext_openUTF8(&expectedText, expected, -1, &status);
224 if(U_FAILURE(status)) {
225 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
226 return;
227 }
228 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
229 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
230 return;
231 }
232 utext_setNativeIndex(actual, 0);
4388f060 233 if (!testUTextEqual(&expectedText, actual)) {
729e4ab9
A
234 char buf[201 /*21*/];
235 char expectedBuf[201];
2ca993e8
A
236 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
237 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
729e4ab9
A
238 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
239 }
240 utext_close(&expectedText);
241}
242/**
243 * @param expected invariant (platform local text) input
244 */
245
246void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
247 UErrorCode status = U_ZERO_ERROR;
248 UText expectedText = UTEXT_INITIALIZER;
249 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
250 if(U_FAILURE(status)) {
251 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
252 return;
253 }
254 utext_setNativeIndex(actual, 0);
4388f060 255 if (!testUTextEqual(&expectedText, actual)) {
729e4ab9
A
256 char buf[201 /*21*/];
257 char expectedBuf[201];
2ca993e8
A
258 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
259 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
729e4ab9
A
260 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
261 }
262 utext_close(&expectedText);
263}
264
/**
 * Asserts that the UText `actual` matches `expected`.
 * Assumes utf-8 input; forwards to assertUText() with the call site's
 * file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` matches `expected`.
 * Assumes Invariant input; forwards to assertUTextInvariant() with the call
 * site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
b75a7d8f 273
4388f060
A
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
 */
4388f060
A
279
280#define INV_BUFSIZ 2048 /* increase this if too small */
281
51004dcb 282static int64_t inv_next=0;
4388f060
A
283
284#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
285static char inv_buf[INV_BUFSIZ];
286#endif
287
288static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
289 if(length==-1) length=strlen(inv);
290#if U_CHARSET_FAMILY==U_ASCII_FAMILY
291 inv_next+=length;
292 return utext_openUTF8(ut, inv, length, status);
293#else
294 if(inv_next+length+1>INV_BUFSIZ) {
295 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
296 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
297 *status = U_MEMORY_ALLOCATION_ERROR;
298 return NULL;
299 }
300
301 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
302 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
303 inv_next+=length;
304
305#if 0
306 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
307#endif
308
309 return utext_openUTF8(ut, (const char*)buf, length, status);
310#endif
311}
312
b75a7d8f
A
313
314//---------------------------------------------------------------------------
315//
316// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
317// for the LookingAt() and Match() functions.
318//
319// usage:
320// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
321//
322// The expected results are UBool - TRUE or FALSE.
323// The input text is unescaped. The pattern is not.
374ca955 324//
b75a7d8f
A
325//
326//---------------------------------------------------------------------------
327
729e4ab9 328#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
b75a7d8f 329
46f4442e
A
330UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
331 const UnicodeString pattern(pat, -1, US_INV);
332 const UnicodeString inputText(text, -1, US_INV);
b75a7d8f
A
333 UErrorCode status = U_ZERO_ERROR;
334 UParseError pe;
335 RegexPattern *REPattern = NULL;
336 RegexMatcher *REMatcher = NULL;
337 UBool retVal = TRUE;
338
46f4442e 339 UnicodeString patString(pat, -1, US_INV);
b75a7d8f
A
340 REPattern = RegexPattern::compile(patString, 0, pe, status);
341 if (U_FAILURE(status)) {
729e4ab9 342 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
b75a7d8f
A
343 line, u_errorName(status));
344 return FALSE;
345 }
57a6839d 346 if (line==376) { REPattern->dumpPattern();}
b75a7d8f
A
347
348 UnicodeString inputString(inputText);
349 UnicodeString unEscapedInput = inputString.unescape();
350 REMatcher = REPattern->matcher(unEscapedInput, status);
351 if (U_FAILURE(status)) {
352 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
353 line, u_errorName(status));
354 return FALSE;
355 }
374ca955 356
b75a7d8f
A
357 UBool actualmatch;
358 actualmatch = REMatcher->lookingAt(status);
359 if (U_FAILURE(status)) {
360 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
361 line, u_errorName(status));
362 retVal = FALSE;
363 }
364 if (actualmatch != looking) {
365 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
366 retVal = FALSE;
367 }
368
369 status = U_ZERO_ERROR;
370 actualmatch = REMatcher->matches(status);
371 if (U_FAILURE(status)) {
372 errln("RegexTest failure in matches() at line %d. Status = %s\n",
373 line, u_errorName(status));
374 retVal = FALSE;
375 }
376 if (actualmatch != match) {
377 errln("RegexTest: wrong return from matches() at line %d.\n", line);
378 retVal = FALSE;
379 }
380
381 if (retVal == FALSE) {
57a6839d 382 REPattern->dumpPattern();
b75a7d8f
A
383 }
384
385 delete REPattern;
386 delete REMatcher;
387 return retVal;
388}
374ca955 389
b75a7d8f 390
729e4ab9
A
391UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
392 UText pattern = UTEXT_INITIALIZER;
393 int32_t inputUTF8Length;
394 char *textChars = NULL;
395 UText inputText = UTEXT_INITIALIZER;
396 UErrorCode status = U_ZERO_ERROR;
397 UParseError pe;
398 RegexPattern *REPattern = NULL;
399 RegexMatcher *REMatcher = NULL;
400 UBool retVal = TRUE;
401
402 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
403 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
404 if (U_FAILURE(status)) {
405 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
406 line, u_errorName(status));
407 return FALSE;
408 }
57a6839d 409
729e4ab9
A
410 UnicodeString inputString(text, -1, US_INV);
411 UnicodeString unEscapedInput = inputString.unescape();
412 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
413 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
57a6839d 414
729e4ab9
A
415 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
416 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
417 // UTF-8 does not allow unpaired surrogates, so this could actually happen
418 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
419 return TRUE; // not a failure of the Regex engine
420 }
421 status = U_ZERO_ERROR; // buffer overflow
422 textChars = new char[inputUTF8Length+1];
423 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
424 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
57a6839d 425
4388f060 426 REMatcher = &REPattern->matcher(status)->reset(&inputText);
729e4ab9
A
427 if (U_FAILURE(status)) {
428 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
429 line, u_errorName(status));
430 return FALSE;
431 }
432
433 UBool actualmatch;
434 actualmatch = REMatcher->lookingAt(status);
435 if (U_FAILURE(status)) {
436 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
437 line, u_errorName(status));
438 retVal = FALSE;
439 }
440 if (actualmatch != looking) {
441 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
442 retVal = FALSE;
443 }
444
445 status = U_ZERO_ERROR;
446 actualmatch = REMatcher->matches(status);
447 if (U_FAILURE(status)) {
448 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
449 line, u_errorName(status));
450 retVal = FALSE;
451 }
452 if (actualmatch != match) {
453 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
454 retVal = FALSE;
455 }
456
457 if (retVal == FALSE) {
57a6839d 458 REPattern->dumpPattern();
729e4ab9
A
459 }
460
461 delete REPattern;
462 delete REMatcher;
463 utext_close(&inputText);
464 utext_close(&pattern);
465 delete[] textChars;
466 return retVal;
467}
b75a7d8f
A
468
469
b75a7d8f
A
470
471//---------------------------------------------------------------------------
472//
473// REGEX_ERR Macro + invocation function to simplify writing tests
474// regex tests for incorrect patterns
475//
476// usage:
477// REGEX_ERR("pattern", expected error line, column, expected status);
478//
479//---------------------------------------------------------------------------
480#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
481
482void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
46f4442e 483 UErrorCode expectedStatus, int32_t line) {
b75a7d8f
A
484 UnicodeString pattern(pat);
485
486 UErrorCode status = U_ZERO_ERROR;
487 UParseError pe;
488 RegexPattern *callerPattern = NULL;
489
490 //
491 // Compile the caller's pattern
492 //
493 UnicodeString patString(pat);
494 callerPattern = RegexPattern::compile(patString, 0, pe, status);
495 if (status != expectedStatus) {
729e4ab9 496 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
b75a7d8f
A
497 } else {
498 if (status != U_ZERO_ERROR) {
499 if (pe.line != errLine || pe.offset != errCol) {
500 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
501 line, errLine, errCol, pe.line, pe.offset);
502 }
503 }
504 }
505
506 delete callerPattern;
729e4ab9
A
507
508 //
509 // Compile again, using a UTF-8-based UText
510 //
511 UText patternText = UTEXT_INITIALIZER;
512 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
513 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
514 if (status != expectedStatus) {
515 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
516 } else {
517 if (status != U_ZERO_ERROR) {
518 if (pe.line != errLine || pe.offset != errCol) {
519 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
520 line, errLine, errCol, pe.line, pe.offset);
521 }
522 }
523 }
57a6839d 524
729e4ab9
A
525 delete callerPattern;
526 utext_close(&patternText);
b75a7d8f
A
527}
528
529
530
531//---------------------------------------------------------------------------
532//
533// Basic Check for basic functionality of regex pattern matching.
534// Avoid the use of REGEX_FIND test macro, which has
535// substantial dependencies on basic Regex functionality.
536//
537//---------------------------------------------------------------------------
538void RegexTest::Basic() {
539
540
541//
542// Debug - slide failing test cases early
543//
544#if 0
545 {
546 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
547 UParseError pe;
548 UErrorCode status = U_ZERO_ERROR;
4388f060
A
549 RegexPattern *pattern;
550 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
57a6839d 551 pattern->dumpPattern();
4388f060
A
552 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
553 UBool result = m->find();
554 printf("result = %d\n", result);
555 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
b75a7d8f
A
556 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
557 }
558 exit(1);
559#endif
560
561
562 //
563 // Pattern with parentheses
564 //
565 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
566 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
567 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
568
569 //
570 // Patterns with *
571 //
572 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
573 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
574 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
575 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
576 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
577
578 REGEX_TESTLM("a*", "", TRUE, TRUE);
579 REGEX_TESTLM("a*", "b", TRUE, FALSE);
580
581
582 //
583 // Patterns with "."
584 //
585 REGEX_TESTLM(".", "abc", TRUE, FALSE);
586 REGEX_TESTLM("...", "abc", TRUE, TRUE);
587 REGEX_TESTLM("....", "abc", FALSE, FALSE);
588 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
589 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
590 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
591 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
592 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
593
594 //
595 // Patterns with * applied to chars at end of literal string
596 //
597 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
598 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
599
600 //
601 // Supplemental chars match as single chars, not a pair of surrogates.
602 //
603 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
604 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
605 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
606
607
608 //
609 // UnicodeSets in the pattern
610 //
611 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
612 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
613 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
614 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
615 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
616 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
617
618 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
619 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
620 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
621 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
622 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
623
624 //
625 // OR operator in patterns
626 //
627 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
628 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
629 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
630 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
631
632 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
633 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
634 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
635 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
636 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
637 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
638
639 //
640 // +
641 //
642 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
643 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
644 REGEX_TESTLM("b+", "", FALSE, FALSE);
645 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
646 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
647 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
648
649 //
650 // ?
651 //
652 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
653 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
654 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
655 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
656 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
657 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
658 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
659 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
660 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
661
662 //
663 // Escape sequences that become single literal chars, handled internally
664 // by ICU's Unescape.
665 //
374ca955 666
b75a7d8f
A
667 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
668 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
374ca955
A
669 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
670 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
b75a7d8f
A
671 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
672 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
673 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
674 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
374ca955
A
675 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
676 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
b75a7d8f
A
677
678 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
679 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
680
681 // Escape of special chars in patterns
374ca955 682 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
729e4ab9 683}
b75a7d8f
A
684
685
729e4ab9
A
686//---------------------------------------------------------------------------
687//
688// UTextBasic Check for quirks that are specific to the UText
689// implementation.
690//
691//---------------------------------------------------------------------------
692void RegexTest::UTextBasic() {
693 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
694 UErrorCode status = U_ZERO_ERROR;
695 UText pattern = UTEXT_INITIALIZER;
696 utext_openUTF8(&pattern, str_abc, -1, &status);
697 RegexMatcher matcher(&pattern, 0, status);
698 REGEX_CHECK_STATUS;
57a6839d 699
729e4ab9
A
700 UText input = UTEXT_INITIALIZER;
701 utext_openUTF8(&input, str_abc, -1, &status);
702 REGEX_CHECK_STATUS;
703 matcher.reset(&input);
704 REGEX_CHECK_STATUS;
705 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
57a6839d 706
729e4ab9
A
707 matcher.reset(matcher.inputText());
708 REGEX_CHECK_STATUS;
709 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
57a6839d 710
729e4ab9
A
711 utext_close(&pattern);
712 utext_close(&input);
73c04bcf 713}
b75a7d8f
A
714
715
716//---------------------------------------------------------------------------
717//
374ca955 718// API_Match Test that the API for class RegexMatcher
b75a7d8f
A
719// is present and nominally working, but excluding functions
720// implementing replace operations.
721//
722//---------------------------------------------------------------------------
723void RegexTest::API_Match() {
724 UParseError pe;
725 UErrorCode status=U_ZERO_ERROR;
726 int32_t flags = 0;
727
728 //
729 // Debug - slide failing test cases early
730 //
731#if 0
732 {
733 }
734 return;
735#endif
736
737 //
738 // Simple pattern compilation
739 //
740 {
741 UnicodeString re("abc");
742 RegexPattern *pat2;
743 pat2 = RegexPattern::compile(re, flags, pe, status);
744 REGEX_CHECK_STATUS;
374ca955 745
b75a7d8f
A
746 UnicodeString inStr1 = "abcdef this is a test";
747 UnicodeString instr2 = "not abc";
748 UnicodeString empty = "";
374ca955
A
749
750
b75a7d8f
A
751 //
752 // Matcher creation and reset.
753 //
754 RegexMatcher *m1 = pat2->matcher(inStr1, status);
755 REGEX_CHECK_STATUS;
374ca955 756 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f
A
757 REGEX_ASSERT(m1->input() == inStr1);
758 m1->reset(instr2);
759 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
760 REGEX_ASSERT(m1->input() == instr2);
761 m1->reset(inStr1);
762 REGEX_ASSERT(m1->input() == inStr1);
763 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
764 m1->reset(empty);
765 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
766 REGEX_ASSERT(m1->input() == empty);
767 REGEX_ASSERT(&m1->pattern() == pat2);
374ca955
A
768
769 //
770 // reset(pos, status)
771 //
772 m1->reset(inStr1);
773 m1->reset(4, status);
774 REGEX_CHECK_STATUS;
775 REGEX_ASSERT(m1->input() == inStr1);
776 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
777
778 m1->reset(-1, status);
779 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
780 status = U_ZERO_ERROR;
781
782 m1->reset(0, status);
783 REGEX_CHECK_STATUS;
784 status = U_ZERO_ERROR;
785
786 int32_t len = m1->input().length();
787 m1->reset(len-1, status);
788 REGEX_CHECK_STATUS;
789 status = U_ZERO_ERROR;
790
791 m1->reset(len, status);
729e4ab9
A
792 REGEX_CHECK_STATUS;
793 status = U_ZERO_ERROR;
794
795 m1->reset(len+1, status);
374ca955
A
796 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
797 status = U_ZERO_ERROR;
798
799 //
800 // match(pos, status)
801 //
802 m1->reset(instr2);
803 REGEX_ASSERT(m1->matches(4, status) == TRUE);
804 m1->reset();
805 REGEX_ASSERT(m1->matches(3, status) == FALSE);
806 m1->reset();
807 REGEX_ASSERT(m1->matches(5, status) == FALSE);
808 REGEX_ASSERT(m1->matches(4, status) == TRUE);
809 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
810 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
811
812 // Match() at end of string should fail, but should not
813 // be an error.
814 status = U_ZERO_ERROR;
815 len = m1->input().length();
816 REGEX_ASSERT(m1->matches(len, status) == FALSE);
817 REGEX_CHECK_STATUS;
818
819 // Match beyond end of string should fail with an error.
820 status = U_ZERO_ERROR;
821 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
822 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
823
824 // Successful match at end of string.
825 {
826 status = U_ZERO_ERROR;
827 RegexMatcher m("A?", 0, status); // will match zero length string.
828 REGEX_CHECK_STATUS;
829 m.reset(inStr1);
830 len = inStr1.length();
831 REGEX_ASSERT(m.matches(len, status) == TRUE);
832 REGEX_CHECK_STATUS;
833 m.reset(empty);
834 REGEX_ASSERT(m.matches(0, status) == TRUE);
835 REGEX_CHECK_STATUS;
836 }
837
838
839 //
840 // lookingAt(pos, status)
841 //
842 status = U_ZERO_ERROR;
843 m1->reset(instr2); // "not abc"
844 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
845 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
846 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
847 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
848 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
849 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
850 status = U_ZERO_ERROR;
851 len = m1->input().length();
852 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
853 REGEX_CHECK_STATUS;
854 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
855 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
856
b75a7d8f
A
857 delete m1;
858 delete pat2;
859 }
860
861
862 //
374ca955 863 // Capture Group.
b75a7d8f
A
864 // RegexMatcher::start();
865 // RegexMatcher::end();
866 // RegexMatcher::groupCount();
867 //
868 {
869 int32_t flags=0;
870 UParseError pe;
871 UErrorCode status=U_ZERO_ERROR;
872
873 UnicodeString re("01(23(45)67)(.*)");
874 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
875 REGEX_CHECK_STATUS;
876 UnicodeString data = "0123456789";
374ca955 877
b75a7d8f
A
878 RegexMatcher *matcher = pat->matcher(data, status);
879 REGEX_CHECK_STATUS;
374ca955 880 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
46f4442e
A
881 static const int32_t matchStarts[] = {0, 2, 4, 8};
882 static const int32_t matchEnds[] = {10, 8, 6, 10};
883 int32_t i;
b75a7d8f
A
884 for (i=0; i<4; i++) {
885 int32_t actualStart = matcher->start(i, status);
886 REGEX_CHECK_STATUS;
887 if (actualStart != matchStarts[i]) {
888 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
889 __LINE__, i, matchStarts[i], actualStart);
890 }
891 int32_t actualEnd = matcher->end(i, status);
892 REGEX_CHECK_STATUS;
893 if (actualEnd != matchEnds[i]) {
894 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
895 __LINE__, i, matchEnds[i], actualEnd);
896 }
897 }
898
899 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
900 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
901
902 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
903 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
904 matcher->reset();
905 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
906
907 matcher->lookingAt(status);
908 REGEX_ASSERT(matcher->group(status) == "0123456789");
909 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
910 REGEX_ASSERT(matcher->group(1, status) == "234567" );
911 REGEX_ASSERT(matcher->group(2, status) == "45" );
912 REGEX_ASSERT(matcher->group(3, status) == "89" );
913 REGEX_CHECK_STATUS;
914 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
915 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
916 matcher->reset();
917 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
918
919 delete matcher;
920 delete pat;
921
922 }
923
924 //
925 // find
926 //
927 {
928 int32_t flags=0;
929 UParseError pe;
930 UErrorCode status=U_ZERO_ERROR;
931
932 UnicodeString re("abc");
933 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
934 REGEX_CHECK_STATUS;
935 UnicodeString data = ".abc..abc...abc..";
936 // 012345678901234567
374ca955 937
b75a7d8f
A
938 RegexMatcher *matcher = pat->matcher(data, status);
939 REGEX_CHECK_STATUS;
940 REGEX_ASSERT(matcher->find());
941 REGEX_ASSERT(matcher->start(status) == 1);
942 REGEX_ASSERT(matcher->find());
943 REGEX_ASSERT(matcher->start(status) == 6);
944 REGEX_ASSERT(matcher->find());
945 REGEX_ASSERT(matcher->start(status) == 12);
946 REGEX_ASSERT(matcher->find() == FALSE);
947 REGEX_ASSERT(matcher->find() == FALSE);
948
949 matcher->reset();
950 REGEX_ASSERT(matcher->find());
951 REGEX_ASSERT(matcher->start(status) == 1);
952
953 REGEX_ASSERT(matcher->find(0, status));
954 REGEX_ASSERT(matcher->start(status) == 1);
955 REGEX_ASSERT(matcher->find(1, status));
956 REGEX_ASSERT(matcher->start(status) == 1);
957 REGEX_ASSERT(matcher->find(2, status));
958 REGEX_ASSERT(matcher->start(status) == 6);
959 REGEX_ASSERT(matcher->find(12, status));
960 REGEX_ASSERT(matcher->start(status) == 12);
961 REGEX_ASSERT(matcher->find(13, status) == FALSE);
962 REGEX_ASSERT(matcher->find(16, status) == FALSE);
374ca955 963 REGEX_ASSERT(matcher->find(17, status) == FALSE);
b75a7d8f 964 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
b75a7d8f 965
374ca955 966 status = U_ZERO_ERROR;
b75a7d8f 967 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
968 status = U_ZERO_ERROR;
969 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f
A
970
971 REGEX_ASSERT(matcher->groupCount() == 0);
972
973 delete matcher;
974 delete pat;
975 }
976
977
978 //
979 // find, with \G in pattern (true if at the end of a previous match).
980 //
981 {
982 int32_t flags=0;
983 UParseError pe;
984 UErrorCode status=U_ZERO_ERROR;
985
46f4442e 986 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
b75a7d8f
A
987 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
988 REGEX_CHECK_STATUS;
989 UnicodeString data = ".abcabc.abc..";
990 // 012345678901234567
374ca955 991
b75a7d8f
A
992 RegexMatcher *matcher = pat->matcher(data, status);
993 REGEX_CHECK_STATUS;
994 REGEX_ASSERT(matcher->find());
995 REGEX_ASSERT(matcher->start(status) == 0);
374ca955 996 REGEX_ASSERT(matcher->start(1, status) == -1);
b75a7d8f
A
997 REGEX_ASSERT(matcher->start(2, status) == 1);
998
999 REGEX_ASSERT(matcher->find());
1000 REGEX_ASSERT(matcher->start(status) == 4);
374ca955 1001 REGEX_ASSERT(matcher->start(1, status) == 4);
b75a7d8f
A
1002 REGEX_ASSERT(matcher->start(2, status) == -1);
1003 REGEX_CHECK_STATUS;
1004
1005 delete matcher;
1006 delete pat;
1007 }
1008
374ca955
A
1009 //
1010 // find with zero length matches, match position should bump ahead
1011 // to prevent loops.
1012 //
1013 {
46f4442e 1014 int32_t i;
374ca955
A
1015 UErrorCode status=U_ZERO_ERROR;
1016 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1017 // using an always-true look-ahead.
1018 REGEX_CHECK_STATUS;
1019 UnicodeString s(" ");
1020 m.reset(s);
1021 for (i=0; ; i++) {
1022 if (m.find() == FALSE) {
1023 break;
1024 }
1025 REGEX_ASSERT(m.start(status) == i);
1026 REGEX_ASSERT(m.end(status) == i);
1027 }
1028 REGEX_ASSERT(i==5);
1029
1030 // Check that the bump goes over surrogate pairs OK
46f4442e 1031 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
374ca955
A
1032 s = s.unescape();
1033 m.reset(s);
1034 for (i=0; ; i+=2) {
1035 if (m.find() == FALSE) {
1036 break;
1037 }
1038 REGEX_ASSERT(m.start(status) == i);
1039 REGEX_ASSERT(m.end(status) == i);
1040 }
1041 REGEX_ASSERT(i==10);
1042 }
1043 {
1044 // find() loop breaking test.
1045 // with pattern of /.?/, should see a series of one char matches, then a single
1046 // match of zero length at the end of the input string.
46f4442e 1047 int32_t i;
374ca955
A
1048 UErrorCode status=U_ZERO_ERROR;
1049 RegexMatcher m(".?", 0, status);
1050 REGEX_CHECK_STATUS;
1051 UnicodeString s(" ");
1052 m.reset(s);
1053 for (i=0; ; i++) {
1054 if (m.find() == FALSE) {
1055 break;
1056 }
1057 REGEX_ASSERT(m.start(status) == i);
1058 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1059 }
1060 REGEX_ASSERT(i==5);
1061 }
1062
1063
b75a7d8f
A
1064 //
1065 // Matchers with no input string behave as if they had an empty input string.
1066 //
1067
1068 {
1069 UErrorCode status = U_ZERO_ERROR;
1070 RegexMatcher m(".?", 0, status);
1071 REGEX_CHECK_STATUS;
1072 REGEX_ASSERT(m.find());
1073 REGEX_ASSERT(m.start(status) == 0);
1074 REGEX_ASSERT(m.input() == "");
1075 }
1076 {
1077 UErrorCode status = U_ZERO_ERROR;
1078 RegexPattern *p = RegexPattern::compile(".", 0, status);
1079 RegexMatcher *m = p->matcher(status);
1080 REGEX_CHECK_STATUS;
374ca955 1081
b75a7d8f
A
1082 REGEX_ASSERT(m->find() == FALSE);
1083 REGEX_ASSERT(m->input() == "");
1084 delete m;
1085 delete p;
1086 }
57a6839d 1087
46f4442e
A
1088 //
1089 // Regions
1090 //
1091 {
1092 UErrorCode status = U_ZERO_ERROR;
1093 UnicodeString testString("This is test data");
1094 RegexMatcher m(".*", testString, 0, status);
1095 REGEX_CHECK_STATUS;
1096 REGEX_ASSERT(m.regionStart() == 0);
1097 REGEX_ASSERT(m.regionEnd() == testString.length());
1098 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1099 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 1100
46f4442e
A
1101 m.region(2,4, status);
1102 REGEX_CHECK_STATUS;
1103 REGEX_ASSERT(m.matches(status));
1104 REGEX_ASSERT(m.start(status)==2);
1105 REGEX_ASSERT(m.end(status)==4);
1106 REGEX_CHECK_STATUS;
57a6839d 1107
46f4442e
A
1108 m.reset();
1109 REGEX_ASSERT(m.regionStart() == 0);
1110 REGEX_ASSERT(m.regionEnd() == testString.length());
57a6839d 1111
46f4442e
A
1112 UnicodeString shorterString("short");
1113 m.reset(shorterString);
1114 REGEX_ASSERT(m.regionStart() == 0);
1115 REGEX_ASSERT(m.regionEnd() == shorterString.length());
57a6839d 1116
46f4442e
A
1117 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1118 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1119 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1120 REGEX_ASSERT(&m == &m.reset());
1121 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
57a6839d 1122
46f4442e
A
1123 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1124 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1125 REGEX_ASSERT(&m == &m.reset());
1126 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 1127
46f4442e
A
1128 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1129 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1130 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1131 REGEX_ASSERT(&m == &m.reset());
1132 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1133
1134 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1135 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1136 REGEX_ASSERT(&m == &m.reset());
1137 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
57a6839d 1138
46f4442e 1139 }
57a6839d 1140
46f4442e
A
1141 //
1142 // hitEnd() and requireEnd()
1143 //
1144 {
1145 UErrorCode status = U_ZERO_ERROR;
1146 UnicodeString testString("aabb");
1147 RegexMatcher m1(".*", testString, 0, status);
1148 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1149 REGEX_ASSERT(m1.hitEnd() == TRUE);
1150 REGEX_ASSERT(m1.requireEnd() == FALSE);
1151 REGEX_CHECK_STATUS;
57a6839d 1152
46f4442e
A
1153 status = U_ZERO_ERROR;
1154 RegexMatcher m2("a*", testString, 0, status);
1155 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1156 REGEX_ASSERT(m2.hitEnd() == FALSE);
1157 REGEX_ASSERT(m2.requireEnd() == FALSE);
1158 REGEX_CHECK_STATUS;
1159
1160 status = U_ZERO_ERROR;
1161 RegexMatcher m3(".*$", testString, 0, status);
1162 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1163 REGEX_ASSERT(m3.hitEnd() == TRUE);
1164 REGEX_ASSERT(m3.requireEnd() == TRUE);
1165 REGEX_CHECK_STATUS;
1166 }
1167
b75a7d8f 1168
374ca955
A
1169 //
1170 // Compilation error on reset with UChar *
1171 // These were a hazard that people were stumbling over with runtime errors.
1172 // Changed them to compiler errors by adding private methods that more closely
1173 // matched the incorrect use of the functions.
1174 //
1175#if 0
1176 {
1177 UErrorCode status = U_ZERO_ERROR;
1178 UChar ucharString[20];
1179 RegexMatcher m(".", 0, status);
1180 m.reset(ucharString); // should not compile.
1181
1182 RegexPattern *p = RegexPattern::compile(".", 0, status);
1183 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1184
1185 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1186 }
1187#endif
1188
46f4442e 1189 //
57a6839d 1190 // Time Outs.
46f4442e
A
1191 // Note: These tests will need to be changed when the regexp engine is
1192 // able to detect and cut short the exponential time behavior on
1193 // this type of match.
1194 //
1195 {
1196 UErrorCode status = U_ZERO_ERROR;
1197 // Enough 'a's in the string to cause the match to time out.
1198 // (Each additional 'a' doubles the time)
1199 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1200 RegexMatcher matcher("(a+)+b", testString, 0, status);
1201 REGEX_CHECK_STATUS;
1202 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1203 matcher.setTimeLimit(100, status);
1204 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1205 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1206 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1207 }
1208 {
1209 UErrorCode status = U_ZERO_ERROR;
1210 // Few enough 'a's to slip in under the time limit.
1211 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1212 RegexMatcher matcher("(a+)+b", testString, 0, status);
1213 REGEX_CHECK_STATUS;
1214 matcher.setTimeLimit(100, status);
1215 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1216 REGEX_CHECK_STATUS;
1217 }
57a6839d 1218
46f4442e
A
1219 //
1220 // Stack Limits
1221 //
1222 {
1223 UErrorCode status = U_ZERO_ERROR;
729e4ab9 1224 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
57a6839d 1225
46f4442e
A
1226 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1227 // of the '+', and makes the stack frames larger.
1228 RegexMatcher matcher("(A)+A$", testString, 0, status);
57a6839d 1229
46f4442e
A
1230 // With the default stack, this match should fail to run
1231 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1232 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
57a6839d 1233
46f4442e
A
1234 // With unlimited stack, it should run
1235 status = U_ZERO_ERROR;
1236 matcher.setStackLimit(0, status);
1237 REGEX_CHECK_STATUS;
1238 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1239 REGEX_CHECK_STATUS;
1240 REGEX_ASSERT(matcher.getStackLimit() == 0);
1241
1242 // With a limited stack, the match should fail
1243 status = U_ZERO_ERROR;
1244 matcher.setStackLimit(10000, status);
1245 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1246 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1247 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1248 }
57a6839d 1249
46f4442e
A
1250 // A pattern that doesn't save state should work with
1251 // a minimal sized stack
1252 {
1253 UErrorCode status = U_ZERO_ERROR;
1254 UnicodeString testString = "abc";
1255 RegexMatcher matcher("abc", testString, 0, status);
1256 REGEX_CHECK_STATUS;
1257 matcher.setStackLimit(30, status);
1258 REGEX_CHECK_STATUS;
1259 REGEX_ASSERT(matcher.matches(status) == TRUE);
1260 REGEX_CHECK_STATUS;
1261 REGEX_ASSERT(matcher.getStackLimit() == 30);
57a6839d 1262
46f4442e
A
1263 // Negative stack sizes should fail
1264 status = U_ZERO_ERROR;
1265 matcher.setStackLimit(1000, status);
1266 REGEX_CHECK_STATUS;
1267 matcher.setStackLimit(-1, status);
1268 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1269 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1270 }
57a6839d 1271
46f4442e 1272
b75a7d8f
A
1273}
1274
1275
1276
1277
1278
1279
1280//---------------------------------------------------------------------------
1281//
374ca955 1282// API_Replace API test for class RegexMatcher, testing the
b75a7d8f
A
1283// Replace family of functions.
1284//
1285//---------------------------------------------------------------------------
1286void RegexTest::API_Replace() {
 // Exercises the replace-family functions of RegexMatcher: replaceFirst(),
 // replaceAll(), appendReplacement() and appendTail(). A single `status`
 // variable is shared by the whole function, so the sections below are
 // order-dependent: each one expects `status` to be clean on entry.
1287 //
1288 // Replace
1289 //
1290 int32_t flags=0;
1291 UParseError pe;
1292 UErrorCode status=U_ZERO_ERROR;
374ca955 1293
b75a7d8f
A
1294 UnicodeString re("abc");
1295 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1296 REGEX_CHECK_STATUS;
1297 UnicodeString data = ".abc..abc...abc..";
1298 // 012345678901234567
1299 RegexMatcher *matcher = pat->matcher(data, status);
374ca955 1300
b75a7d8f
A
1301 //
1302 // Plain vanilla matches.
1303 //
1304 UnicodeString dest;
1305 dest = matcher->replaceFirst("yz", status);
1306 REGEX_CHECK_STATUS;
1307 REGEX_ASSERT(dest == ".yz..abc...abc..");
374ca955 1308
b75a7d8f
A
1309 dest = matcher->replaceAll("yz", status);
1310 REGEX_CHECK_STATUS;
1311 REGEX_ASSERT(dest == ".yz..yz...yz..");
374ca955 1312
b75a7d8f
A
1313 //
1314 // Plain vanilla non-matches.
1315 //
 // With no match in the input, the result is expected to equal the input.
1316 UnicodeString d2 = ".abx..abx...abx..";
1317 matcher->reset(d2);
1318 dest = matcher->replaceFirst("yz", status);
1319 REGEX_CHECK_STATUS;
1320 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 1321
b75a7d8f
A
1322 dest = matcher->replaceAll("yz", status);
1323 REGEX_CHECK_STATUS;
1324 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 1325
b75a7d8f
A
1326 //
1327 // Empty source string
1328 //
1329 UnicodeString d3 = "";
1330 matcher->reset(d3);
1331 dest = matcher->replaceFirst("yz", status);
1332 REGEX_CHECK_STATUS;
1333 REGEX_ASSERT(dest == "");
374ca955 1334
b75a7d8f
A
1335 dest = matcher->replaceAll("yz", status);
1336 REGEX_CHECK_STATUS;
1337 REGEX_ASSERT(dest == "");
374ca955 1338
b75a7d8f
A
1339 //
1340 // Empty substitution string
1341 //
1342 matcher->reset(data); // ".abc..abc...abc.."
1343 dest = matcher->replaceFirst("", status);
1344 REGEX_CHECK_STATUS;
1345 REGEX_ASSERT(dest == "...abc...abc..");
374ca955 1346
b75a7d8f
A
1347 dest = matcher->replaceAll("", status);
1348 REGEX_CHECK_STATUS;
1349 REGEX_ASSERT(dest == "........");
374ca955 1350
b75a7d8f
A
1351 //
1352 // match whole string
1353 //
1354 UnicodeString d4 = "abc";
374ca955 1355 matcher->reset(d4);
b75a7d8f
A
1356 dest = matcher->replaceFirst("xyz", status);
1357 REGEX_CHECK_STATUS;
1358 REGEX_ASSERT(dest == "xyz");
374ca955 1359
b75a7d8f
A
1360 dest = matcher->replaceAll("xyz", status);
1361 REGEX_CHECK_STATUS;
1362 REGEX_ASSERT(dest == "xyz");
374ca955 1363
b75a7d8f
A
1364 //
1365 // Capture Group, simple case
1366 //
1367 UnicodeString re2("a(..)");
1368 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1369 REGEX_CHECK_STATUS;
1370 UnicodeString d5 = "abcdefg";
1371 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1372 REGEX_CHECK_STATUS;
1373 dest = matcher2->replaceFirst("$1$1", status);
1374 REGEX_CHECK_STATUS;
1375 REGEX_ASSERT(dest == "bcbcdefg");
1376
 // A backslash-escaped '$' is expected to be copied literally; an unescaped "$1" is
 // expected to be replaced by the text of capture group 1.
46f4442e 1377 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
b75a7d8f
A
1378 REGEX_CHECK_STATUS;
1379 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1380
 // A '$' that is not followed by a group number is expected to produce a failure
 // status; the status is cleared afterwards so the following sections start clean.
1381 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
b331163b
A
1382 REGEX_ASSERT(U_FAILURE(status));
1383 status = U_ZERO_ERROR;
b75a7d8f 1384
 // The character U+1D7CF following '$' is expected to be accepted as group
 // number 1, as shown by the expected result below.
46f4442e 1385 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
b75a7d8f
A
1386 replacement = replacement.unescape();
1387 dest = matcher2->replaceFirst(replacement, status);
1388 REGEX_CHECK_STATUS;
1389 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
374ca955 1390
 // The pattern "a(..)" has a single capture group, so "$5" is out of range.
b75a7d8f 1391 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
1392
1393
1394 //
1395 // Replacement String with \u hex escapes
1396 //
1397 {
1398 UnicodeString src = "abc 1 abc 2 abc 3";
46f4442e 1399 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
374ca955
A
1400 matcher->reset(src);
1401 UnicodeString result = matcher->replaceAll(substitute, status);
1402 REGEX_CHECK_STATUS;
1403 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1404 }
1405 {
1406 UnicodeString src = "abc !";
46f4442e 1407 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
374ca955
A
1408 matcher->reset(src);
1409 UnicodeString result = matcher->replaceAll(substitute, status);
1410 REGEX_CHECK_STATUS;
 // Build the expected string with the supplementary code point U+10000 appended directly.
1411 UnicodeString expected = UnicodeString("--");
1412 expected.append((UChar32)0x10000);
1413 expected.append("-- !");
1414 REGEX_ASSERT(result == expected);
1415 }
b75a7d8f 1416 // TODO: need more thorough testing of capture substitutions.
374ca955
A
1417
1418 // Bug 4057
1419 //
1420 {
1421 status = U_ZERO_ERROR;
1422 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1423 RegexMatcher m("ss(.*?)ee", 0, status);
1424 REGEX_CHECK_STATUS;
1425 UnicodeString result;
1426
1427 // Multiple finds do NOT bump up the previous appendReplacement position.
1428 m.reset(s);
1429 m.find();
1430 m.find();
1431 m.appendReplacement(result, "ooh", status);
1432 REGEX_CHECK_STATUS;
1433 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1434
1435 // After a reset into the interior of a string, appendReplacement still starts at beginning.
1436 status = U_ZERO_ERROR;
1437 result.truncate(0);
1438 m.reset(10, status);
1439 m.find();
1440 m.find();
1441 m.appendReplacement(result, "ooh", status);
1442 REGEX_CHECK_STATUS;
1443 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1444
1445 // find() at interior of string, appendReplacement still starts at beginning.
1446 status = U_ZERO_ERROR;
1447 result.truncate(0);
1448 m.reset();
1449 m.find(10, status);
1450 m.find();
1451 m.appendReplacement(result, "ooh", status);
1452 REGEX_CHECK_STATUS;
1453 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1454
 // appendTail() is expected to add the input text that follows the last match.
1455 m.appendTail(result);
1456 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1457
1458 }
1459
b75a7d8f
A
 // Release the heap-allocated matchers and patterns created above.
1460 delete matcher2;
1461 delete pat2;
1462 delete matcher;
1463 delete pat;
1464}
1465
1466
1467//---------------------------------------------------------------------------
1468//
1469// API_Pattern Test that the API for class RegexPattern is
1470// present and nominally working.
1471//
1472//---------------------------------------------------------------------------
1473void RegexTest::API_Pattern() {
 // Exercises the RegexPattern API: default construction, equality operators,
 // assignment, copy construction, clone(), flags(), the static matches()
 // convenience function, split(), pattern(), and the class ID functions.
 // `status` and `pe` are shared by the whole function, so sections are
 // order-dependent.
1474 RegexPattern pata; // Test default constructor to not crash.
1475 RegexPattern patb;
1476
1477 REGEX_ASSERT(pata == patb);
1478 REGEX_ASSERT(pata == pata);
1479
1480 UnicodeString re1("abc[a-l][m-z]");
1481 UnicodeString re2("def");
1482 UErrorCode status = U_ZERO_ERROR;
1483 UParseError pe;
1484
1485 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1486 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1487 REGEX_CHECK_STATUS;
1488 REGEX_ASSERT(*pat1 == *pat1);
1489 REGEX_ASSERT(*pat1 != pata);
1490
1491 // Assign
1492 patb = *pat1;
1493 REGEX_ASSERT(patb == *pat1);
1494
1495 // Copy Construct
1496 RegexPattern patc(*pat1);
1497 REGEX_ASSERT(patc == *pat1);
1498 REGEX_ASSERT(patb == patc);
 // NOTE(review): the next assertion compares the two pointers, not the patterns
 // they point to; `*pat1 != *pat2` was presumably intended -- confirm.
1499 REGEX_ASSERT(pat1 != pat2);
1500 patb = *pat2;
1501 REGEX_ASSERT(patb != patc);
1502 REGEX_ASSERT(patb == *pat2);
1503
1504 // Compile with no flags.
1505 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1506 REGEX_ASSERT(*pat1a == *pat1);
1507
1508 REGEX_ASSERT(pat1a->flags() == 0);
374ca955 1509
b75a7d8f
A
1510 // Compile with different flags should be not equal
1511 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1512 REGEX_CHECK_STATUS;
1513
1514 REGEX_ASSERT(*pat1b != *pat1a);
1515 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1516 REGEX_ASSERT(pat1a->flags() == 0);
1517 delete pat1b;
b75a7d8f
A
1518
1519 // clone
1520 RegexPattern *pat1c = pat1->clone();
1521 REGEX_ASSERT(*pat1c == *pat1);
1522 REGEX_ASSERT(*pat1c != *pat2);
1523
b75a7d8f
A
1524 delete pat1c;
1525 delete pat1a;
1526 delete pat1;
1527 delete pat2;
1528
1529
374ca955
A
1530 //
1531 // Verify that a matcher created from a cloned pattern works.
1532 // (Jitterbug 3423)
1533 //
1534 {
1535 UErrorCode status = U_ZERO_ERROR;
46f4442e 1536 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
374ca955
A
1537 RegexPattern *pClone = pSource->clone();
 // The source pattern is deleted before the clone is used, so the clone
 // has to work without it.
1538 delete pSource;
1539 RegexMatcher *mFromClone = pClone->matcher(status);
1540 REGEX_CHECK_STATUS;
1541 UnicodeString s = "Hello World";
1542 mFromClone->reset(s);
1543 REGEX_ASSERT(mFromClone->find() == TRUE);
1544 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1545 REGEX_ASSERT(mFromClone->find() == TRUE);
1546 REGEX_ASSERT(mFromClone->group(status) == "World");
1547 REGEX_ASSERT(mFromClone->find() == FALSE);
1548 delete mFromClone;
1549 delete pClone;
1550 }
1551
b75a7d8f
A
1552 //
1553 // matches convenience API
1554 //
1555 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1556 REGEX_CHECK_STATUS;
1557 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1558 REGEX_CHECK_STATUS;
1559 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1560 REGEX_CHECK_STATUS;
1561 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1562 REGEX_CHECK_STATUS;
1563 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1564 REGEX_CHECK_STATUS;
 // With a failing status on entry, matches() is expected to return FALSE and
 // leave the incoming error code unchanged, even though "abc" would match "abc".
1565 status = U_INDEX_OUTOFBOUNDS_ERROR;
1566 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1567 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1568
1569
1570 //
1571 // Split()
1572 //
 // The `fields` array is reused by every split() call below, so elements beyond the
 // returned count may still hold values written by an earlier call; several
 // assertions rely on this.
1573 status = U_ZERO_ERROR;
1574 pat1 = RegexPattern::compile(" +", pe, status);
1575 REGEX_CHECK_STATUS;
1576 UnicodeString fields[10];
1577
1578 int32_t n;
1579 n = pat1->split("Now is the time", fields, 10, status);
1580 REGEX_CHECK_STATUS;
1581 REGEX_ASSERT(n==4);
1582 REGEX_ASSERT(fields[0]=="Now");
1583 REGEX_ASSERT(fields[1]=="is");
1584 REGEX_ASSERT(fields[2]=="the");
1585 REGEX_ASSERT(fields[3]=="time");
1586 REGEX_ASSERT(fields[4]=="");
1587
 // With a capacity of 2, the last field is expected to receive all remaining input.
1588 n = pat1->split("Now is the time", fields, 2, status);
1589 REGEX_CHECK_STATUS;
1590 REGEX_ASSERT(n==2);
1591 REGEX_ASSERT(fields[0]=="Now");
1592 REGEX_ASSERT(fields[1]=="is the time");
1593 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1594
 // fields[1] is set to a sentinel to verify that split() with capacity 1 does not write past fields[0].
1595 fields[1] = "*";
1596 status = U_ZERO_ERROR;
1597 n = pat1->split("Now is the time", fields, 1, status);
1598 REGEX_CHECK_STATUS;
1599 REGEX_ASSERT(n==1);
1600 REGEX_ASSERT(fields[0]=="Now is the time");
1601 REGEX_ASSERT(fields[1]=="*");
1602 status = U_ZERO_ERROR;
1603
1604 n = pat1->split(" Now is the time ", fields, 10, status);
1605 REGEX_CHECK_STATUS;
4388f060 1606 REGEX_ASSERT(n==6);
b75a7d8f
A
1607 REGEX_ASSERT(fields[0]=="");
1608 REGEX_ASSERT(fields[1]=="Now");
1609 REGEX_ASSERT(fields[2]=="is");
1610 REGEX_ASSERT(fields[3]=="the");
1611 REGEX_ASSERT(fields[4]=="time");
1612 REGEX_ASSERT(fields[5]=="");
1613
1614 n = pat1->split(" ", fields, 10, status);
1615 REGEX_CHECK_STATUS;
4388f060 1616 REGEX_ASSERT(n==2);
b75a7d8f 1617 REGEX_ASSERT(fields[0]=="");
4388f060 1618 REGEX_ASSERT(fields[1]=="");
b75a7d8f
A
1619
 // Splitting an empty input is expected to produce zero fields and leave fields[0] untouched.
1620 fields[0] = "foo";
1621 n = pat1->split("", fields, 10, status);
1622 REGEX_CHECK_STATUS;
1623 REGEX_ASSERT(n==0);
1624 REGEX_ASSERT(fields[0]=="foo");
1625
1626 delete pat1;
1627
1628 // split, with a pattern with (capture)
 // The text of the delimiter's capture group is expected to appear as its own output field.
46f4442e 1629 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
b75a7d8f
A
1630 REGEX_CHECK_STATUS;
1631
1632 status = U_ZERO_ERROR;
1633 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1634 REGEX_CHECK_STATUS;
4388f060 1635 REGEX_ASSERT(n==7);
b75a7d8f
A
1636 REGEX_ASSERT(fields[0]=="");
1637 REGEX_ASSERT(fields[1]=="a");
1638 REGEX_ASSERT(fields[2]=="Now is ");
1639 REGEX_ASSERT(fields[3]=="b");
1640 REGEX_ASSERT(fields[4]=="the time");
1641 REGEX_ASSERT(fields[5]=="c");
1642 REGEX_ASSERT(fields[6]=="");
1643 REGEX_ASSERT(status==U_ZERO_ERROR);
1644
1645 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1646 REGEX_CHECK_STATUS;
4388f060 1647 REGEX_ASSERT(n==7);
b75a7d8f
A
1648 REGEX_ASSERT(fields[0]==" ");
1649 REGEX_ASSERT(fields[1]=="a");
1650 REGEX_ASSERT(fields[2]=="Now is ");
1651 REGEX_ASSERT(fields[3]=="b");
1652 REGEX_ASSERT(fields[4]=="the time");
1653 REGEX_ASSERT(fields[5]=="c");
1654 REGEX_ASSERT(fields[6]=="");
1655
 // The following cases shrink the output capacity; a sentinel ("foo") is placed just
 // past the capacity to verify that split() does not write beyond it.
1656 status = U_ZERO_ERROR;
1657 fields[6] = "foo";
1658 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1659 REGEX_CHECK_STATUS;
1660 REGEX_ASSERT(n==6);
1661 REGEX_ASSERT(fields[0]==" ");
1662 REGEX_ASSERT(fields[1]=="a");
1663 REGEX_ASSERT(fields[2]=="Now is ");
1664 REGEX_ASSERT(fields[3]=="b");
1665 REGEX_ASSERT(fields[4]=="the time");
4388f060 1666 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
b75a7d8f
A
1667 REGEX_ASSERT(fields[6]=="foo");
1668
1669 status = U_ZERO_ERROR;
1670 fields[5] = "foo";
1671 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1672 REGEX_CHECK_STATUS;
1673 REGEX_ASSERT(n==5);
1674 REGEX_ASSERT(fields[0]==" ");
1675 REGEX_ASSERT(fields[1]=="a");
1676 REGEX_ASSERT(fields[2]=="Now is ");
1677 REGEX_ASSERT(fields[3]=="b");
1678 REGEX_ASSERT(fields[4]=="the time<c>");
1679 REGEX_ASSERT(fields[5]=="foo");
1680
1681 status = U_ZERO_ERROR;
1682 fields[5] = "foo";
1683 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1684 REGEX_CHECK_STATUS;
1685 REGEX_ASSERT(n==5);
1686 REGEX_ASSERT(fields[0]==" ");
1687 REGEX_ASSERT(fields[1]=="a");
1688 REGEX_ASSERT(fields[2]=="Now is ");
1689 REGEX_ASSERT(fields[3]=="b");
1690 REGEX_ASSERT(fields[4]=="the time");
1691 REGEX_ASSERT(fields[5]=="foo");
1692
1693 status = U_ZERO_ERROR;
1694 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1695 REGEX_CHECK_STATUS;
1696 REGEX_ASSERT(n==4);
1697 REGEX_ASSERT(fields[0]==" ");
1698 REGEX_ASSERT(fields[1]=="a");
1699 REGEX_ASSERT(fields[2]=="Now is ");
1700 REGEX_ASSERT(fields[3]=="the time<c>");
1701 status = U_ZERO_ERROR;
1702 delete pat1;
1703
1704 pat1 = RegexPattern::compile("([-,])", pe, status);
1705 REGEX_CHECK_STATUS;
1706 n = pat1->split("1-10,20", fields, 10, status);
1707 REGEX_CHECK_STATUS;
1708 REGEX_ASSERT(n==5);
1709 REGEX_ASSERT(fields[0]=="1");
1710 REGEX_ASSERT(fields[1]=="-");
1711 REGEX_ASSERT(fields[2]=="10");
1712 REGEX_ASSERT(fields[3]==",");
1713 REGEX_ASSERT(fields[4]=="20");
1714 delete pat1;
1715
4388f060
A
1716 // Test split of string with empty trailing fields
1717 pat1 = RegexPattern::compile(",", pe, status);
1718 REGEX_CHECK_STATUS;
1719 n = pat1->split("a,b,c,", fields, 10, status);
1720 REGEX_CHECK_STATUS;
1721 REGEX_ASSERT(n==4);
1722 REGEX_ASSERT(fields[0]=="a");
1723 REGEX_ASSERT(fields[1]=="b");
1724 REGEX_ASSERT(fields[2]=="c");
1725 REGEX_ASSERT(fields[3]=="");
1726
1727 n = pat1->split("a,,,", fields, 10, status);
1728 REGEX_CHECK_STATUS;
1729 REGEX_ASSERT(n==4);
1730 REGEX_ASSERT(fields[0]=="a");
1731 REGEX_ASSERT(fields[1]=="");
1732 REGEX_ASSERT(fields[2]=="");
1733 REGEX_ASSERT(fields[3]=="");
1734 delete pat1;
1735
1736 // Split Separator with zero length match.
1737 pat1 = RegexPattern::compile(":?", pe, status);
1738 REGEX_CHECK_STATUS;
1739 n = pat1->split("abc", fields, 10, status);
1740 REGEX_CHECK_STATUS;
1741 REGEX_ASSERT(n==5);
1742 REGEX_ASSERT(fields[0]=="");
1743 REGEX_ASSERT(fields[1]=="a");
1744 REGEX_ASSERT(fields[2]=="b");
1745 REGEX_ASSERT(fields[3]=="c");
1746 REGEX_ASSERT(fields[4]=="");
1747
1748 delete pat1;
b75a7d8f
A
1749
1750 //
1751 // RegexPattern::pattern()
1752 //
 // A default-constructed pattern is expected to report an empty pattern string.
1753 pat1 = new RegexPattern();
1754 REGEX_ASSERT(pat1->pattern() == "");
1755 delete pat1;
1756
1757 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1758 REGEX_CHECK_STATUS;
1759 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1760 delete pat1;
1761
1762
1763 //
1764 // classID functions
1765 //
 // The dynamic class ID of each object must equal its class's static ID, be
 // non-NULL, and differ between RegexPattern and RegexMatcher.
1766 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1767 REGEX_CHECK_STATUS;
1768 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1769 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
374ca955
A
1770 UnicodeString Hello("Hello, world.");
1771 RegexMatcher *m = pat1->matcher(Hello, status);
b75a7d8f
A
1772 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1773 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1774 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1775 delete m;
1776 delete pat1;
1777
1778}
1779
1780//---------------------------------------------------------------------------
1781//
729e4ab9
A
1782// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1783// is present and working, but excluding functions
1784// implementing replace operations.
b75a7d8f
A
1785//
1786//---------------------------------------------------------------------------
729e4ab9
A
1787void RegexTest::API_Match_UTF8() {
1788 UParseError pe;
1789 UErrorCode status=U_ZERO_ERROR;
1790 int32_t flags = 0;
b75a7d8f
A
1791
1792 //
729e4ab9 1793 // Debug - slide failing test cases early
b75a7d8f 1794 //
729e4ab9
A
1795#if 0
1796 {
374ca955 1797 }
729e4ab9
A
1798 return;
1799#endif
b75a7d8f
A
1800
1801 //
729e4ab9 1802 // Simple pattern compilation
b75a7d8f 1803 //
729e4ab9
A
1804 {
1805 UText re = UTEXT_INITIALIZER;
1806 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
4388f060 1807 REGEX_VERBOSE_TEXT(&re);
729e4ab9
A
1808 RegexPattern *pat2;
1809 pat2 = RegexPattern::compile(&re, flags, pe, status);
1810 REGEX_CHECK_STATUS;
b75a7d8f 1811
729e4ab9
A
1812 UText input1 = UTEXT_INITIALIZER;
1813 UText input2 = UTEXT_INITIALIZER;
1814 UText empty = UTEXT_INITIALIZER;
1815 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1816 REGEX_VERBOSE_TEXT(&input1);
1817 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1818 REGEX_VERBOSE_TEXT(&input2);
1819 utext_openUChars(&empty, NULL, 0, &status);
57a6839d 1820
729e4ab9
A
1821 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1822 int32_t input2Len = strlen("not abc");
b75a7d8f 1823
b75a7d8f 1824
729e4ab9
A
1825 //
1826 // Matcher creation and reset.
1827 //
4388f060 1828 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
729e4ab9
A
1829 REGEX_CHECK_STATUS;
1830 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1831 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1832 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1833 m1->reset(&input2);
1834 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1835 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1836 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1837 m1->reset(&input1);
1838 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1839 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1840 m1->reset(&empty);
1841 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1842 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
b75a7d8f 1843
729e4ab9
A
1844 //
1845 // reset(pos, status)
1846 //
1847 m1->reset(&input1);
1848 m1->reset(4, status);
1849 REGEX_CHECK_STATUS;
1850 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1851 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f 1852
729e4ab9
A
1853 m1->reset(-1, status);
1854 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f 1855 status = U_ZERO_ERROR;
b75a7d8f 1856
729e4ab9
A
1857 m1->reset(0, status);
1858 REGEX_CHECK_STATUS;
1859 status = U_ZERO_ERROR;
b75a7d8f 1860
729e4ab9
A
1861 m1->reset(input1Len-1, status);
1862 REGEX_CHECK_STATUS;
1863 status = U_ZERO_ERROR;
1864
1865 m1->reset(input1Len, status);
1866 REGEX_CHECK_STATUS;
1867 status = U_ZERO_ERROR;
1868
1869 m1->reset(input1Len+1, status);
1870 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1871 status = U_ZERO_ERROR;
b75a7d8f
A
1872
1873 //
729e4ab9 1874 // matches(pos, status)
b75a7d8f 1875 //
729e4ab9
A
1876 m1->reset(&input2);
1877 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1878 m1->reset();
1879 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1880 m1->reset();
1881 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1882 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1883 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1885
1886 // matches() at end of string should fail, but should not
1887 // be an error.
1888 status = U_ZERO_ERROR;
1889 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1890 REGEX_CHECK_STATUS;
1891
1892 // Match beyond end of string should fail with an error.
1893 status = U_ZERO_ERROR;
1894 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1895 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1896
1897 // Successful match at end of string.
1898 {
1899 status = U_ZERO_ERROR;
1900 RegexMatcher m("A?", 0, status); // will match zero length string.
1901 REGEX_CHECK_STATUS;
1902 m.reset(&input1);
1903 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1904 REGEX_CHECK_STATUS;
1905 m.reset(&empty);
1906 REGEX_ASSERT(m.matches(0, status) == TRUE);
1907 REGEX_CHECK_STATUS;
b75a7d8f
A
1908 }
1909
1910
1911 //
729e4ab9 1912 // lookingAt(pos, status)
b75a7d8f 1913 //
729e4ab9
A
1914 status = U_ZERO_ERROR;
1915 m1->reset(&input2); // "not abc"
1916 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1917 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1918 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1919 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1920 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1921 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1922 status = U_ZERO_ERROR;
1923 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1924 REGEX_CHECK_STATUS;
1925 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1926 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1927
1928 delete m1;
1929 delete pat2;
57a6839d 1930
729e4ab9
A
1931 utext_close(&re);
1932 utext_close(&input1);
1933 utext_close(&input2);
1934 utext_close(&empty);
1935 }
1936
1937
1938 //
1939 // Capture Group.
1940 // RegexMatcher::start();
1941 // RegexMatcher::end();
1942 // RegexMatcher::groupCount();
1943 //
1944 {
1945 int32_t flags=0;
1946 UParseError pe;
1947 UErrorCode status=U_ZERO_ERROR;
1948 UText re=UTEXT_INITIALIZER;
1949 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1950 utext_openUTF8(&re, str_01234567_pat, -1, &status);
57a6839d 1951
729e4ab9
A
1952 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1953 REGEX_CHECK_STATUS;
57a6839d 1954
729e4ab9
A
1955 UText input = UTEXT_INITIALIZER;
1956 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1957 utext_openUTF8(&input, str_0123456789, -1, &status);
1958
4388f060 1959 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
729e4ab9
A
1960 REGEX_CHECK_STATUS;
1961 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1962 static const int32_t matchStarts[] = {0, 2, 4, 8};
1963 static const int32_t matchEnds[] = {10, 8, 6, 10};
1964 int32_t i;
1965 for (i=0; i<4; i++) {
1966 int32_t actualStart = matcher->start(i, status);
1967 REGEX_CHECK_STATUS;
1968 if (actualStart != matchStarts[i]) {
1969 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1970 __FILE__, __LINE__, i, matchStarts[i], actualStart);
1971 }
1972 int32_t actualEnd = matcher->end(i, status);
1973 REGEX_CHECK_STATUS;
1974 if (actualEnd != matchEnds[i]) {
1975 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1976 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1977 }
1978 }
1979
1980 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1981 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1982
1983 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1984 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1985 matcher->reset();
1986 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1987
1988 matcher->lookingAt(status);
57a6839d 1989
729e4ab9
A
1990 UnicodeString dest;
1991 UText destText = UTEXT_INITIALIZER;
1992 utext_openUnicodeString(&destText, &dest, &status);
1993 UText *result;
1994 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
b331163b 1995 // Test shallow-clone API
729e4ab9
A
1996 int64_t group_len;
1997 result = matcher->group((UText *)NULL, group_len, status);
1998 REGEX_CHECK_STATUS;
1999 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2000 utext_close(result);
2001 result = matcher->group(0, &destText, group_len, status);
2002 REGEX_CHECK_STATUS;
2003 REGEX_ASSERT(result == &destText);
2004 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2005 // destText is now immutable, reopen it
2006 utext_close(&destText);
2007 utext_openUnicodeString(&destText, &dest, &status);
57a6839d 2008
b331163b
A
2009 int64_t length;
2010 result = matcher->group(0, NULL, length, status);
729e4ab9
A
2011 REGEX_CHECK_STATUS;
2012 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2013 utext_close(result);
b331163b 2014 result = matcher->group(0, &destText, length, status);
729e4ab9
A
2015 REGEX_CHECK_STATUS;
2016 REGEX_ASSERT(result == &destText);
b331163b
A
2017 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2018 REGEX_ASSERT(length == 10);
2019 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
57a6839d 2020
b331163b
A
2021 // Capture Group 1 == "234567"
2022 result = matcher->group(1, NULL, length, status);
729e4ab9 2023 REGEX_CHECK_STATUS;
b331163b
A
2024 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2025 REGEX_ASSERT(length == 6);
2026 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
729e4ab9 2027 utext_close(result);
b331163b
A
2028
2029 result = matcher->group(1, &destText, length, status);
729e4ab9
A
2030 REGEX_CHECK_STATUS;
2031 REGEX_ASSERT(result == &destText);
b331163b
A
2032 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2033 REGEX_ASSERT(length == 6);
2034 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2035 utext_close(result);
57a6839d 2036
b331163b
A
2037 // Capture Group 2 == "45"
2038 result = matcher->group(2, NULL, length, status);
729e4ab9 2039 REGEX_CHECK_STATUS;
b331163b
A
2040 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2041 REGEX_ASSERT(length == 2);
2042 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
729e4ab9 2043 utext_close(result);
b331163b
A
2044
2045 result = matcher->group(2, &destText, length, status);
729e4ab9
A
2046 REGEX_CHECK_STATUS;
2047 REGEX_ASSERT(result == &destText);
b331163b
A
2048 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2049 REGEX_ASSERT(length == 2);
2050 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2051 utext_close(result);
57a6839d 2052
b331163b
A
2053 // Capture Group 3 == "89"
2054 result = matcher->group(3, NULL, length, status);
729e4ab9 2055 REGEX_CHECK_STATUS;
b331163b
A
2056 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2057 REGEX_ASSERT(length == 2);
2058 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
729e4ab9 2059 utext_close(result);
b331163b
A
2060
2061 result = matcher->group(3, &destText, length, status);
729e4ab9
A
2062 REGEX_CHECK_STATUS;
2063 REGEX_ASSERT(result == &destText);
b331163b
A
2064 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2065 REGEX_ASSERT(length == 2);
2066 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2067 utext_close(result);
729e4ab9 2068
b331163b
A
2069 // Capture Group number out of range.
2070 status = U_ZERO_ERROR;
729e4ab9 2071 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
b331163b 2072 status = U_ZERO_ERROR;
729e4ab9 2073 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
b331163b 2074 status = U_ZERO_ERROR;
729e4ab9
A
2075 matcher->reset();
2076 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2077
2078 delete matcher;
2079 delete pat;
57a6839d 2080
729e4ab9
A
2081 utext_close(&destText);
2082 utext_close(&input);
2083 utext_close(&re);
2084 }
2085
2086 //
2087 // find
2088 //
2089 {
2090 int32_t flags=0;
2091 UParseError pe;
2092 UErrorCode status=U_ZERO_ERROR;
2093 UText re=UTEXT_INITIALIZER;
2094 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2095 utext_openUTF8(&re, str_abc, -1, &status);
2096
2097 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2098 REGEX_CHECK_STATUS;
2099 UText input = UTEXT_INITIALIZER;
2100 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2101 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2102 // 012345678901234567
2103
4388f060 2104 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
729e4ab9
A
2105 REGEX_CHECK_STATUS;
2106 REGEX_ASSERT(matcher->find());
2107 REGEX_ASSERT(matcher->start(status) == 1);
2108 REGEX_ASSERT(matcher->find());
2109 REGEX_ASSERT(matcher->start(status) == 6);
2110 REGEX_ASSERT(matcher->find());
2111 REGEX_ASSERT(matcher->start(status) == 12);
2112 REGEX_ASSERT(matcher->find() == FALSE);
2113 REGEX_ASSERT(matcher->find() == FALSE);
2114
2115 matcher->reset();
2116 REGEX_ASSERT(matcher->find());
2117 REGEX_ASSERT(matcher->start(status) == 1);
2118
2119 REGEX_ASSERT(matcher->find(0, status));
2120 REGEX_ASSERT(matcher->start(status) == 1);
2121 REGEX_ASSERT(matcher->find(1, status));
2122 REGEX_ASSERT(matcher->start(status) == 1);
2123 REGEX_ASSERT(matcher->find(2, status));
2124 REGEX_ASSERT(matcher->start(status) == 6);
2125 REGEX_ASSERT(matcher->find(12, status));
2126 REGEX_ASSERT(matcher->start(status) == 12);
2127 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2128 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2129 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2130 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2131
2132 status = U_ZERO_ERROR;
2133 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134 status = U_ZERO_ERROR;
2135 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136
2137 REGEX_ASSERT(matcher->groupCount() == 0);
2138
2139 delete matcher;
2140 delete pat;
57a6839d 2141
729e4ab9
A
2142 utext_close(&input);
2143 utext_close(&re);
2144 }
2145
2146
2147 //
2148 // find, with \G in pattern (true if at the end of a previous match).
2149 //
2150 {
2151 int32_t flags=0;
2152 UParseError pe;
2153 UErrorCode status=U_ZERO_ERROR;
2154 UText re=UTEXT_INITIALIZER;
2155 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2156 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2157
2158 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
57a6839d 2159
729e4ab9
A
2160 REGEX_CHECK_STATUS;
2161 UText input = UTEXT_INITIALIZER;
2162 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2163 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2164 // 012345678901234567
2165
4388f060 2166 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
729e4ab9
A
2167 REGEX_CHECK_STATUS;
2168 REGEX_ASSERT(matcher->find());
2169 REGEX_ASSERT(matcher->start(status) == 0);
2170 REGEX_ASSERT(matcher->start(1, status) == -1);
2171 REGEX_ASSERT(matcher->start(2, status) == 1);
2172
2173 REGEX_ASSERT(matcher->find());
2174 REGEX_ASSERT(matcher->start(status) == 4);
2175 REGEX_ASSERT(matcher->start(1, status) == 4);
2176 REGEX_ASSERT(matcher->start(2, status) == -1);
2177 REGEX_CHECK_STATUS;
2178
2179 delete matcher;
2180 delete pat;
57a6839d 2181
729e4ab9
A
2182 utext_close(&input);
2183 utext_close(&re);
2184 }
2185
2186 //
2187 // find with zero length matches, match position should bump ahead
2188 // to prevent loops.
2189 //
2190 {
2191 int32_t i;
2192 UErrorCode status=U_ZERO_ERROR;
2193 RegexMatcher m("(?= ?)", 0, status); // This pattern will produce zero-length matches anywhere,
2194 // using an always-true look-ahead.
2195 REGEX_CHECK_STATUS;
2196 UText s = UTEXT_INITIALIZER;
2197 utext_openUTF8(&s, " ", -1, &status);
2198 m.reset(&s);
2199 for (i=0; ; i++) {
2200 if (m.find() == FALSE) {
2201 break;
2202 }
2203 REGEX_ASSERT(m.start(status) == i);
2204 REGEX_ASSERT(m.end(status) == i);
2205 }
2206 REGEX_ASSERT(i==5);
2207
2208 // Check that the bump goes over characters outside the BMP OK
2209 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2210 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2211 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2212 m.reset(&s);
2213 for (i=0; ; i+=4) {
2214 if (m.find() == FALSE) {
2215 break;
2216 }
2217 REGEX_ASSERT(m.start(status) == i);
2218 REGEX_ASSERT(m.end(status) == i);
2219 }
2220 REGEX_ASSERT(i==20);
57a6839d 2221
729e4ab9
A
2222 utext_close(&s);
2223 }
2224 {
2225 // find() loop breaking test.
2226 // with pattern of /.?/, should see a series of one char matches, then a single
2227 // match of zero length at the end of the input string.
2228 int32_t i;
2229 UErrorCode status=U_ZERO_ERROR;
2230 RegexMatcher m(".?", 0, status);
2231 REGEX_CHECK_STATUS;
2232 UText s = UTEXT_INITIALIZER;
2233 utext_openUTF8(&s, " ", -1, &status);
2234 m.reset(&s);
2235 for (i=0; ; i++) {
2236 if (m.find() == FALSE) {
2237 break;
2238 }
2239 REGEX_ASSERT(m.start(status) == i);
2240 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2241 }
2242 REGEX_ASSERT(i==5);
57a6839d 2243
729e4ab9
A
2244 utext_close(&s);
2245 }
2246
2247
2248 //
2249 // Matchers with no input string behave as if they had an empty input string.
2250 //
2251
2252 {
2253 UErrorCode status = U_ZERO_ERROR;
2254 RegexMatcher m(".?", 0, status);
2255 REGEX_CHECK_STATUS;
2256 REGEX_ASSERT(m.find());
2257 REGEX_ASSERT(m.start(status) == 0);
2258 REGEX_ASSERT(m.input() == "");
2259 }
2260 {
2261 UErrorCode status = U_ZERO_ERROR;
2262 RegexPattern *p = RegexPattern::compile(".", 0, status);
2263 RegexMatcher *m = p->matcher(status);
2264 REGEX_CHECK_STATUS;
2265
2266 REGEX_ASSERT(m->find() == FALSE);
2267 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2268 delete m;
2269 delete p;
2270 }
57a6839d 2271
729e4ab9
A
2272 //
2273 // Regions
2274 //
2275 {
2276 UErrorCode status = U_ZERO_ERROR;
2277 UText testPattern = UTEXT_INITIALIZER;
2278 UText testText = UTEXT_INITIALIZER;
2279 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2280 REGEX_VERBOSE_TEXT(&testPattern);
2281 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2282 REGEX_VERBOSE_TEXT(&testText);
57a6839d 2283
729e4ab9
A
2284 RegexMatcher m(&testPattern, &testText, 0, status);
2285 REGEX_CHECK_STATUS;
2286 REGEX_ASSERT(m.regionStart() == 0);
2287 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2288 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2289 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 2290
729e4ab9
A
2291 m.region(2,4, status);
2292 REGEX_CHECK_STATUS;
2293 REGEX_ASSERT(m.matches(status));
2294 REGEX_ASSERT(m.start(status)==2);
2295 REGEX_ASSERT(m.end(status)==4);
2296 REGEX_CHECK_STATUS;
57a6839d 2297
729e4ab9
A
2298 m.reset();
2299 REGEX_ASSERT(m.regionStart() == 0);
2300 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
57a6839d 2301
729e4ab9
A
2302 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2303 REGEX_VERBOSE_TEXT(&testText);
2304 m.reset(&testText);
2305 REGEX_ASSERT(m.regionStart() == 0);
2306 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
57a6839d 2307
729e4ab9
A
2308 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2309 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2310 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2311 REGEX_ASSERT(&m == &m.reset());
2312 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
57a6839d 2313
729e4ab9
A
2314 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2315 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2316 REGEX_ASSERT(&m == &m.reset());
2317 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 2318
729e4ab9
A
2319 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2320 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2321 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2322 REGEX_ASSERT(&m == &m.reset());
2323 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2324
2325 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2326 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2327 REGEX_ASSERT(&m == &m.reset());
2328 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
57a6839d 2329
729e4ab9
A
2330 utext_close(&testText);
2331 utext_close(&testPattern);
2332 }
57a6839d 2333
729e4ab9
A
2334 //
2335 // hitEnd() and requireEnd()
2336 //
2337 {
2338 UErrorCode status = U_ZERO_ERROR;
2339 UText testPattern = UTEXT_INITIALIZER;
2340 UText testText = UTEXT_INITIALIZER;
2341 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2342 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2343 utext_openUTF8(&testPattern, str_, -1, &status);
2344 utext_openUTF8(&testText, str_aabb, -1, &status);
57a6839d 2345
729e4ab9
A
2346 RegexMatcher m1(&testPattern, &testText, 0, status);
2347 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2348 REGEX_ASSERT(m1.hitEnd() == TRUE);
2349 REGEX_ASSERT(m1.requireEnd() == FALSE);
2350 REGEX_CHECK_STATUS;
57a6839d 2351
729e4ab9
A
2352 status = U_ZERO_ERROR;
2353 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2354 utext_openUTF8(&testPattern, str_a, -1, &status);
2355 RegexMatcher m2(&testPattern, &testText, 0, status);
2356 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2357 REGEX_ASSERT(m2.hitEnd() == FALSE);
2358 REGEX_ASSERT(m2.requireEnd() == FALSE);
2359 REGEX_CHECK_STATUS;
2360
2361 status = U_ZERO_ERROR;
2362 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2363 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2364 RegexMatcher m3(&testPattern, &testText, 0, status);
2365 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2366 REGEX_ASSERT(m3.hitEnd() == TRUE);
2367 REGEX_ASSERT(m3.requireEnd() == TRUE);
2368 REGEX_CHECK_STATUS;
57a6839d 2369
729e4ab9
A
2370 utext_close(&testText);
2371 utext_close(&testPattern);
2372 }
2373}
2374
2375
2376//---------------------------------------------------------------------------
2377//
2378// API_Replace_UTF8 API test for class RegexMatcher, testing the
2379// Replace family of functions.
2380//
2381//---------------------------------------------------------------------------
2382void RegexTest::API_Replace_UTF8() {
2383 //
2384 // Replace
2385 //
2386 int32_t flags=0;
2387 UParseError pe;
2388 UErrorCode status=U_ZERO_ERROR;
2389
2390 UText re=UTEXT_INITIALIZER;
2391 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2392 REGEX_VERBOSE_TEXT(&re);
2393 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2394 REGEX_CHECK_STATUS;
57a6839d 2395
729e4ab9
A
2396 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2397 // 012345678901234567
2398 UText dataText = UTEXT_INITIALIZER;
2399 utext_openUTF8(&dataText, data, -1, &status);
2400 REGEX_CHECK_STATUS;
2401 REGEX_VERBOSE_TEXT(&dataText);
4388f060 2402 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
729e4ab9
A
2403
2404 //
2405 // Plain vanilla matches.
2406 //
2407 UnicodeString dest;
2408 UText destText = UTEXT_INITIALIZER;
2409 utext_openUnicodeString(&destText, &dest, &status);
2410 UText *result;
57a6839d 2411
729e4ab9 2412 UText replText = UTEXT_INITIALIZER;
57a6839d 2413
729e4ab9
A
2414 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2415 utext_openUTF8(&replText, str_yz, -1, &status);
2416 REGEX_VERBOSE_TEXT(&replText);
2417 result = matcher->replaceFirst(&replText, NULL, status);
2418 REGEX_CHECK_STATUS;
2419 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2420 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2421 utext_close(result);
2422 result = matcher->replaceFirst(&replText, &destText, status);
2423 REGEX_CHECK_STATUS;
2424 REGEX_ASSERT(result == &destText);
2425 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2426
2427 result = matcher->replaceAll(&replText, NULL, status);
2428 REGEX_CHECK_STATUS;
2429 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2430 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2431 utext_close(result);
2432
2433 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2434 result = matcher->replaceAll(&replText, &destText, status);
2435 REGEX_CHECK_STATUS;
2436 REGEX_ASSERT(result == &destText);
2437 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2438
2439 //
2440 // Plain vanilla non-matches.
2441 //
2442 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2443 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2444 matcher->reset(&dataText);
57a6839d 2445
729e4ab9
A
2446 result = matcher->replaceFirst(&replText, NULL, status);
2447 REGEX_CHECK_STATUS;
2448 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2449 utext_close(result);
2450 result = matcher->replaceFirst(&replText, &destText, status);
2451 REGEX_CHECK_STATUS;
2452 REGEX_ASSERT(result == &destText);
2453 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2454
2455 result = matcher->replaceAll(&replText, NULL, status);
2456 REGEX_CHECK_STATUS;
2457 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2458 utext_close(result);
2459 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2460 result = matcher->replaceAll(&replText, &destText, status);
2461 REGEX_CHECK_STATUS;
2462 REGEX_ASSERT(result == &destText);
2463 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2464
2465 //
2466 // Empty source string
2467 //
2468 utext_openUTF8(&dataText, NULL, 0, &status);
2469 matcher->reset(&dataText);
57a6839d 2470
729e4ab9
A
2471 result = matcher->replaceFirst(&replText, NULL, status);
2472 REGEX_CHECK_STATUS;
2473 REGEX_ASSERT_UTEXT_UTF8("", result);
2474 utext_close(result);
2475 result = matcher->replaceFirst(&replText, &destText, status);
2476 REGEX_CHECK_STATUS;
2477 REGEX_ASSERT(result == &destText);
2478 REGEX_ASSERT_UTEXT_UTF8("", result);
2479
2480 result = matcher->replaceAll(&replText, NULL, status);
2481 REGEX_CHECK_STATUS;
2482 REGEX_ASSERT_UTEXT_UTF8("", result);
2483 utext_close(result);
2484 result = matcher->replaceAll(&replText, &destText, status);
2485 REGEX_CHECK_STATUS;
2486 REGEX_ASSERT(result == &destText);
2487 REGEX_ASSERT_UTEXT_UTF8("", result);
2488
2489 //
2490 // Empty substitution string
2491 //
2492 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2493 matcher->reset(&dataText);
57a6839d 2494
729e4ab9
A
2495 utext_openUTF8(&replText, NULL, 0, &status);
2496 result = matcher->replaceFirst(&replText, NULL, status);
2497 REGEX_CHECK_STATUS;
2498 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2499 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2500 utext_close(result);
2501 result = matcher->replaceFirst(&replText, &destText, status);
2502 REGEX_CHECK_STATUS;
2503 REGEX_ASSERT(result == &destText);
2504 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2505
2506 result = matcher->replaceAll(&replText, NULL, status);
2507 REGEX_CHECK_STATUS;
2508 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2509 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2510 utext_close(result);
2511 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2512 result = matcher->replaceAll(&replText, &destText, status);
2513 REGEX_CHECK_STATUS;
2514 REGEX_ASSERT(result == &destText);
2515 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2516
2517 //
2518 // match whole string
2519 //
2520 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2521 utext_openUTF8(&dataText, str_abc, -1, &status);
2522 matcher->reset(&dataText);
2523
2524 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2525 utext_openUTF8(&replText, str_xyz, -1, &status);
2526 result = matcher->replaceFirst(&replText, NULL, status);
2527 REGEX_CHECK_STATUS;
2528 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2529 utext_close(result);
2530 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2531 result = matcher->replaceFirst(&replText, &destText, status);
2532 REGEX_CHECK_STATUS;
2533 REGEX_ASSERT(result == &destText);
2534 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2535
2536 result = matcher->replaceAll(&replText, NULL, status);
2537 REGEX_CHECK_STATUS;
2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539 utext_close(result);
2540 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2541 result = matcher->replaceAll(&replText, &destText, status);
2542 REGEX_CHECK_STATUS;
2543 REGEX_ASSERT(result == &destText);
2544 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2545
2546 //
2547 // Capture Group, simple case
2548 //
2549 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2550 utext_openUTF8(&re, str_add, -1, &status);
2551 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2552 REGEX_CHECK_STATUS;
2553
2554 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2555 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
4388f060 2556 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
729e4ab9 2557 REGEX_CHECK_STATUS;
57a6839d 2558
729e4ab9
A
2559 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2560 utext_openUTF8(&replText, str_11, -1, &status);
2561 result = matcher2->replaceFirst(&replText, NULL, status);
2562 REGEX_CHECK_STATUS;
2563 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2564 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2565 utext_close(result);
2566 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2567 result = matcher2->replaceFirst(&replText, &destText, status);
2568 REGEX_CHECK_STATUS;
2569 REGEX_ASSERT(result == &destText);
2570 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
57a6839d
A
2571
2572 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
4388f060
A
2573 utext_openUTF8(&replText, str_v, -1, &status);
2574 REGEX_VERBOSE_TEXT(&replText);
729e4ab9
A
2575 result = matcher2->replaceFirst(&replText, NULL, status);
2576 REGEX_CHECK_STATUS;
2577 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2578 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2579 utext_close(result);
2580 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2581 result = matcher2->replaceFirst(&replText, &destText, status);
2582 REGEX_CHECK_STATUS;
2583 REGEX_ASSERT(result == &destText);
2584 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
57a6839d 2585
b331163b
A
2586 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2587 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2588 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
729e4ab9
A
2589 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2590 result = matcher2->replaceFirst(&replText, NULL, status);
2591 REGEX_CHECK_STATUS;
2592 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2593 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2594 utext_close(result);
2595 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2596 result = matcher2->replaceFirst(&replText, &destText, status);
2597 REGEX_CHECK_STATUS;
2598 REGEX_ASSERT(result == &destText);
2599 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2600
2601 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2602 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2603 // 012345678901234567890123456
2604 supplDigitChars[22] = 0xF0;
2605 supplDigitChars[23] = 0x9D;
2606 supplDigitChars[24] = 0x9F;
2607 supplDigitChars[25] = 0x8F;
2608 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
57a6839d 2609
729e4ab9
A
2610 result = matcher2->replaceFirst(&replText, NULL, status);
2611 REGEX_CHECK_STATUS;
2612 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2613 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2614 utext_close(result);
2615 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2616 result = matcher2->replaceFirst(&replText, &destText, status);
2617 REGEX_CHECK_STATUS;
2618 REGEX_ASSERT(result == &destText);
2619 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2620 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2621 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2622 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2623// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2624 utext_close(result);
2625 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2626 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2627 REGEX_ASSERT(result == &destText);
2628// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2629
2630 //
2631 // Replacement String with \u hex escapes
2632 //
2633 {
2634 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2635 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2636 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2637 utext_openUTF8(&replText, str_u0043, -1, &status);
2638 matcher->reset(&dataText);
57a6839d 2639
729e4ab9
A
2640 result = matcher->replaceAll(&replText, NULL, status);
2641 REGEX_CHECK_STATUS;
2642 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2643 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2644 utext_close(result);
2645 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2646 result = matcher->replaceAll(&replText, &destText, status);
2647 REGEX_CHECK_STATUS;
2648 REGEX_ASSERT(result == &destText);
2649 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2650 }
2651 {
2652 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2653 utext_openUTF8(&dataText, str_abc, -1, &status);
2654 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2655 utext_openUTF8(&replText, str_U00010000, -1, &status);
2656 matcher->reset(&dataText);
2657
2658 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
57a6839d 2659 // 0123456789
729e4ab9
A
2660 expected[2] = 0xF0;
2661 expected[3] = 0x90;
2662 expected[4] = 0x80;
2663 expected[5] = 0x80;
2664
2665 result = matcher->replaceAll(&replText, NULL, status);
2666 REGEX_CHECK_STATUS;
2667 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2668 utext_close(result);
2669 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2670 result = matcher->replaceAll(&replText, &destText, status);
2671 REGEX_CHECK_STATUS;
2672 REGEX_ASSERT(result == &destText);
2673 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2674 }
2675 // TODO: need more through testing of capture substitutions.
2676
2677 // Bug 4057
2678 //
2679 {
2680 status = U_ZERO_ERROR;
2681const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2682const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2683const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2684 utext_openUTF8(&re, str_ssee, -1, &status);
2685 utext_openUTF8(&dataText, str_blah, -1, &status);
2686 utext_openUTF8(&replText, str_ooh, -1, &status);
57a6839d 2687
729e4ab9
A
2688 RegexMatcher m(&re, 0, status);
2689 REGEX_CHECK_STATUS;
57a6839d 2690
729e4ab9
A
2691 UnicodeString result;
2692 UText resultText = UTEXT_INITIALIZER;
2693 utext_openUnicodeString(&resultText, &result, &status);
2694
2695 // Multiple finds do NOT bump up the previous appendReplacement postion.
2696 m.reset(&dataText);
2697 m.find();
2698 m.find();
2699 m.appendReplacement(&resultText, &replText, status);
2700 REGEX_CHECK_STATUS;
2701 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2702 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2703
2704 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2705 status = U_ZERO_ERROR;
2706 result.truncate(0);
2707 utext_openUnicodeString(&resultText, &result, &status);
2708 m.reset(10, status);
2709 m.find();
2710 m.find();
2711 m.appendReplacement(&resultText, &replText, status);
2712 REGEX_CHECK_STATUS;
2713 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2714 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2715
2716 // find() at interior of string, appendReplacement still starts at beginning.
2717 status = U_ZERO_ERROR;
2718 result.truncate(0);
2719 utext_openUnicodeString(&resultText, &result, &status);
2720 m.reset();
2721 m.find(10, status);
2722 m.find();
2723 m.appendReplacement(&resultText, &replText, status);
2724 REGEX_CHECK_STATUS;
2725 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2726 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2727
2728 m.appendTail(&resultText, status);
2729 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2730 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
57a6839d 2731
729e4ab9
A
2732 utext_close(&resultText);
2733 }
2734
2735 delete matcher2;
2736 delete pat2;
2737 delete matcher;
2738 delete pat;
57a6839d 2739
729e4ab9
A
2740 utext_close(&dataText);
2741 utext_close(&replText);
2742 utext_close(&destText);
2743 utext_close(&re);
2744}
2745
2746
2747//---------------------------------------------------------------------------
2748//
2749// API_Pattern_UTF8 Test that the API for class RegexPattern is
2750// present and nominally working.
2751//
2752//---------------------------------------------------------------------------
2753void RegexTest::API_Pattern_UTF8() {
2754 RegexPattern pata; // Test default constructor to not crash.
2755 RegexPattern patb;
2756
2757 REGEX_ASSERT(pata == patb);
2758 REGEX_ASSERT(pata == pata);
2759
2760 UText re1 = UTEXT_INITIALIZER;
2761 UText re2 = UTEXT_INITIALIZER;
2762 UErrorCode status = U_ZERO_ERROR;
2763 UParseError pe;
57a6839d 2764
729e4ab9
A
2765 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2766 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2767 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2768 utext_openUTF8(&re2, str_def, -1, &status);
2769
2770 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2771 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2772 REGEX_CHECK_STATUS;
2773 REGEX_ASSERT(*pat1 == *pat1);
2774 REGEX_ASSERT(*pat1 != pata);
2775
2776 // Assign
2777 patb = *pat1;
2778 REGEX_ASSERT(patb == *pat1);
2779
2780 // Copy Construct
2781 RegexPattern patc(*pat1);
2782 REGEX_ASSERT(patc == *pat1);
2783 REGEX_ASSERT(patb == patc);
2784 REGEX_ASSERT(pat1 != pat2);
2785 patb = *pat2;
2786 REGEX_ASSERT(patb != patc);
2787 REGEX_ASSERT(patb == *pat2);
2788
2789 // Compile with no flags.
2790 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2791 REGEX_ASSERT(*pat1a == *pat1);
2792
2793 REGEX_ASSERT(pat1a->flags() == 0);
2794
2795 // Compile with different flags should be not equal
2796 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2797 REGEX_CHECK_STATUS;
2798
2799 REGEX_ASSERT(*pat1b != *pat1a);
2800 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2801 REGEX_ASSERT(pat1a->flags() == 0);
2802 delete pat1b;
2803
2804 // clone
2805 RegexPattern *pat1c = pat1->clone();
2806 REGEX_ASSERT(*pat1c == *pat1);
2807 REGEX_ASSERT(*pat1c != *pat2);
2808
2809 delete pat1c;
2810 delete pat1a;
2811 delete pat1;
2812 delete pat2;
57a6839d 2813
729e4ab9
A
2814 utext_close(&re1);
2815 utext_close(&re2);
2816
2817
2818 //
2819 // Verify that a matcher created from a cloned pattern works.
2820 // (Jitterbug 3423)
2821 //
2822 {
2823 UErrorCode status = U_ZERO_ERROR;
2824 UText pattern = UTEXT_INITIALIZER;
2825 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2826 utext_openUTF8(&pattern, str_pL, -1, &status);
57a6839d 2827
729e4ab9
A
2828 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2829 RegexPattern *pClone = pSource->clone();
2830 delete pSource;
2831 RegexMatcher *mFromClone = pClone->matcher(status);
2832 REGEX_CHECK_STATUS;
57a6839d 2833
729e4ab9
A
2834 UText input = UTEXT_INITIALIZER;
2835 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2836 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2837 mFromClone->reset(&input);
2838 REGEX_ASSERT(mFromClone->find() == TRUE);
2839 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2840 REGEX_ASSERT(mFromClone->find() == TRUE);
2841 REGEX_ASSERT(mFromClone->group(status) == "World");
2842 REGEX_ASSERT(mFromClone->find() == FALSE);
2843 delete mFromClone;
2844 delete pClone;
57a6839d 2845
729e4ab9
A
2846 utext_close(&input);
2847 utext_close(&pattern);
2848 }
2849
2850 //
2851 // matches convenience API
2852 //
2853 {
2854 UErrorCode status = U_ZERO_ERROR;
2855 UText pattern = UTEXT_INITIALIZER;
2856 UText input = UTEXT_INITIALIZER;
57a6839d 2857
729e4ab9
A
2858 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2859 utext_openUTF8(&input, str_randominput, -1, &status);
2860
2861 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2862 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2863 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2864 REGEX_CHECK_STATUS;
57a6839d 2865
729e4ab9
A
2866 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2867 utext_openUTF8(&pattern, str_abc, -1, &status);
2868 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2869 REGEX_CHECK_STATUS;
57a6839d 2870
729e4ab9
A
2871 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2872 utext_openUTF8(&pattern, str_nput, -1, &status);
2873 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2874 REGEX_CHECK_STATUS;
57a6839d 2875
729e4ab9
A
2876 utext_openUTF8(&pattern, str_randominput, -1, &status);
2877 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2878 REGEX_CHECK_STATUS;
2879
2880 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2881 utext_openUTF8(&pattern, str_u, -1, &status);
2882 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2883 REGEX_CHECK_STATUS;
57a6839d 2884
729e4ab9
A
2885 utext_openUTF8(&input, str_abc, -1, &status);
2886 utext_openUTF8(&pattern, str_abc, -1, &status);
2887 status = U_INDEX_OUTOFBOUNDS_ERROR;
2888 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2889 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
57a6839d 2890
729e4ab9
A
2891 utext_close(&input);
2892 utext_close(&pattern);
2893 }
2894
2895
2896 //
2897 // Split()
2898 //
2899 status = U_ZERO_ERROR;
2900 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2901 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2902 pat1 = RegexPattern::compile(&re1, pe, status);
2903 REGEX_CHECK_STATUS;
2904 UnicodeString fields[10];
2905
2906 int32_t n;
2907 n = pat1->split("Now is the time", fields, 10, status);
2908 REGEX_CHECK_STATUS;
2909 REGEX_ASSERT(n==4);
2910 REGEX_ASSERT(fields[0]=="Now");
2911 REGEX_ASSERT(fields[1]=="is");
2912 REGEX_ASSERT(fields[2]=="the");
2913 REGEX_ASSERT(fields[3]=="time");
2914 REGEX_ASSERT(fields[4]=="");
2915
2916 n = pat1->split("Now is the time", fields, 2, status);
2917 REGEX_CHECK_STATUS;
2918 REGEX_ASSERT(n==2);
2919 REGEX_ASSERT(fields[0]=="Now");
2920 REGEX_ASSERT(fields[1]=="is the time");
2921 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2922
2923 fields[1] = "*";
2924 status = U_ZERO_ERROR;
2925 n = pat1->split("Now is the time", fields, 1, status);
2926 REGEX_CHECK_STATUS;
2927 REGEX_ASSERT(n==1);
2928 REGEX_ASSERT(fields[0]=="Now is the time");
2929 REGEX_ASSERT(fields[1]=="*");
2930 status = U_ZERO_ERROR;
2931
2932 n = pat1->split(" Now is the time ", fields, 10, status);
2933 REGEX_CHECK_STATUS;
4388f060 2934 REGEX_ASSERT(n==6);
729e4ab9
A
2935 REGEX_ASSERT(fields[0]=="");
2936 REGEX_ASSERT(fields[1]=="Now");
2937 REGEX_ASSERT(fields[2]=="is");
2938 REGEX_ASSERT(fields[3]=="the");
2939 REGEX_ASSERT(fields[4]=="time");
2940 REGEX_ASSERT(fields[5]=="");
4388f060 2941 REGEX_ASSERT(fields[6]=="");
729e4ab9 2942
4388f060 2943 fields[2] = "*";
729e4ab9
A
2944 n = pat1->split(" ", fields, 10, status);
2945 REGEX_CHECK_STATUS;
4388f060 2946 REGEX_ASSERT(n==2);
729e4ab9 2947 REGEX_ASSERT(fields[0]=="");
4388f060
A
2948 REGEX_ASSERT(fields[1]=="");
2949 REGEX_ASSERT(fields[2]=="*");
729e4ab9
A
2950
2951 fields[0] = "foo";
2952 n = pat1->split("", fields, 10, status);
2953 REGEX_CHECK_STATUS;
2954 REGEX_ASSERT(n==0);
2955 REGEX_ASSERT(fields[0]=="foo");
2956
2957 delete pat1;
2958
2959 // split, with a pattern with (capture)
2960 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2961 pat1 = RegexPattern::compile(&re1, pe, status);
2962 REGEX_CHECK_STATUS;
2963
2964 status = U_ZERO_ERROR;
4388f060 2965 fields[6] = fields[7] = "*";
729e4ab9
A
2966 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2967 REGEX_CHECK_STATUS;
4388f060 2968 REGEX_ASSERT(n==7);
729e4ab9
A
2969 REGEX_ASSERT(fields[0]=="");
2970 REGEX_ASSERT(fields[1]=="a");
2971 REGEX_ASSERT(fields[2]=="Now is ");
2972 REGEX_ASSERT(fields[3]=="b");
2973 REGEX_ASSERT(fields[4]=="the time");
2974 REGEX_ASSERT(fields[5]=="c");
2975 REGEX_ASSERT(fields[6]=="");
4388f060 2976 REGEX_ASSERT(fields[7]=="*");
729e4ab9
A
2977 REGEX_ASSERT(status==U_ZERO_ERROR);
2978
4388f060 2979 fields[6] = fields[7] = "*";
729e4ab9
A
2980 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2981 REGEX_CHECK_STATUS;
4388f060 2982 REGEX_ASSERT(n==7);
729e4ab9
A
2983 REGEX_ASSERT(fields[0]==" ");
2984 REGEX_ASSERT(fields[1]=="a");
2985 REGEX_ASSERT(fields[2]=="Now is ");
2986 REGEX_ASSERT(fields[3]=="b");
2987 REGEX_ASSERT(fields[4]=="the time");
2988 REGEX_ASSERT(fields[5]=="c");
2989 REGEX_ASSERT(fields[6]=="");
4388f060 2990 REGEX_ASSERT(fields[7]=="*");
729e4ab9
A
2991
2992 status = U_ZERO_ERROR;
2993 fields[6] = "foo";
4388f060 2994 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
729e4ab9
A
2995 REGEX_CHECK_STATUS;
2996 REGEX_ASSERT(n==6);
2997 REGEX_ASSERT(fields[0]==" ");
2998 REGEX_ASSERT(fields[1]=="a");
2999 REGEX_ASSERT(fields[2]=="Now is ");
3000 REGEX_ASSERT(fields[3]=="b");
3001 REGEX_ASSERT(fields[4]=="the time");
4388f060 3002 REGEX_ASSERT(fields[5]==" ");
729e4ab9
A
3003 REGEX_ASSERT(fields[6]=="foo");
3004
3005 status = U_ZERO_ERROR;
3006 fields[5] = "foo";
3007 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3008 REGEX_CHECK_STATUS;
3009 REGEX_ASSERT(n==5);
3010 REGEX_ASSERT(fields[0]==" ");
3011 REGEX_ASSERT(fields[1]=="a");
3012 REGEX_ASSERT(fields[2]=="Now is ");
3013 REGEX_ASSERT(fields[3]=="b");
3014 REGEX_ASSERT(fields[4]=="the time<c>");
3015 REGEX_ASSERT(fields[5]=="foo");
3016
3017 status = U_ZERO_ERROR;
3018 fields[5] = "foo";
3019 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3020 REGEX_CHECK_STATUS;
3021 REGEX_ASSERT(n==5);
3022 REGEX_ASSERT(fields[0]==" ");
3023 REGEX_ASSERT(fields[1]=="a");
3024 REGEX_ASSERT(fields[2]=="Now is ");
3025 REGEX_ASSERT(fields[3]=="b");
3026 REGEX_ASSERT(fields[4]=="the time");
3027 REGEX_ASSERT(fields[5]=="foo");
3028
3029 status = U_ZERO_ERROR;
3030 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3031 REGEX_CHECK_STATUS;
3032 REGEX_ASSERT(n==4);
3033 REGEX_ASSERT(fields[0]==" ");
3034 REGEX_ASSERT(fields[1]=="a");
3035 REGEX_ASSERT(fields[2]=="Now is ");
3036 REGEX_ASSERT(fields[3]=="the time<c>");
3037 status = U_ZERO_ERROR;
3038 delete pat1;
3039
3040 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3041 pat1 = RegexPattern::compile(&re1, pe, status);
3042 REGEX_CHECK_STATUS;
3043 n = pat1->split("1-10,20", fields, 10, status);
3044 REGEX_CHECK_STATUS;
3045 REGEX_ASSERT(n==5);
3046 REGEX_ASSERT(fields[0]=="1");
3047 REGEX_ASSERT(fields[1]=="-");
3048 REGEX_ASSERT(fields[2]=="10");
3049 REGEX_ASSERT(fields[3]==",");
3050 REGEX_ASSERT(fields[4]=="20");
3051 delete pat1;
3052
3053
b331163b
A
3054 //
3055 // split of a UText based string, with library allocating output UTexts.
3056 //
3057 {
3058 status = U_ZERO_ERROR;
3059 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3060 UnicodeString stringToSplit("first:second:third");
3061 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3062 REGEX_CHECK_STATUS;
3063
3064 UText *splits[10] = {NULL};
3065 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3066 REGEX_CHECK_STATUS;
3067 REGEX_ASSERT(numFields == 5);
3068 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3069 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3070 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3071 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3072 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3073 REGEX_ASSERT(splits[5] == NULL);
3074
3075 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3076 if (splits[i]) {
3077 utext_close(splits[i]);
3078 splits[i] = NULL;
3079 }
3080 }
3081 utext_close(textToSplit);
3082 }
3083
3084
729e4ab9
A
3085 //
3086 // RegexPattern::pattern() and patternText()
3087 //
3088 pat1 = new RegexPattern();
3089 REGEX_ASSERT(pat1->pattern() == "");
3090 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3091 delete pat1;
4388f060
A
3092 const char *helloWorldInvariant = "(Hello, world)*";
3093 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
729e4ab9
A
3094 pat1 = RegexPattern::compile(&re1, pe, status);
3095 REGEX_CHECK_STATUS;
b331163b 3096 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
729e4ab9
A
3097 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3098 delete pat1;
3099
3100 utext_close(&re1);
3101}
3102
3103
3104//---------------------------------------------------------------------------
3105//
3106// Extended A more thorough check for features of regex patterns
3107// The test cases are in a separate data file,
3108// source/test/testdata/regextst.txt
3109// A description of the test data format is included in that file.
3110//
3111//---------------------------------------------------------------------------
3112
3113const char *
3114RegexTest::getPath(char buffer[2048], const char *filename) {
3115 UErrorCode status=U_ZERO_ERROR;
3116 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3117 if (U_FAILURE(status)) {
3118 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3119 return NULL;
3120 }
3121
3122 strcpy(buffer, testDataDirectory);
3123 strcat(buffer, filename);
3124 return buffer;
3125}
3126
3127void RegexTest::Extended() {
3128 char tdd[2048];
3129 const char *srcPath;
3130 UErrorCode status = U_ZERO_ERROR;
3131 int32_t lineNum = 0;
3132
3133 //
3134 // Open and read the test data file.
3135 //
3136 srcPath=getPath(tdd, "regextst.txt");
3137 if(srcPath==NULL) {
3138 return; /* something went wrong, error already output */
3139 }
3140
3141 int32_t len;
3142 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3143 if (U_FAILURE(status)) {
3144 return; /* something went wrong, error already output */
3145 }
3146
3147 //
3148 // Put the test data into a UnicodeString
3149 //
3150 UnicodeString testString(FALSE, testData, len);
3151
3152 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3153 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
4388f060 3154 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
729e4ab9
A
3155
3156 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3157 UnicodeString testPattern; // The pattern for test from the test file.
3158 UnicodeString testFlags; // the flags for a test.
3159 UnicodeString matchString; // The marked up string to be used as input
3160
3161 if (U_FAILURE(status)){
57a6839d 3162 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
729e4ab9
A
3163 delete [] testData;
3164 return;
3165 }
3166
3167 //
3168 // Loop over the test data file, once per line.
3169 //
3170 while (lineMat.find()) {
3171 lineNum++;
3172 if (U_FAILURE(status)) {
3173 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3174 }
3175
3176 status = U_ZERO_ERROR;
3177 UnicodeString testLine = lineMat.group(1, status);
3178 if (testLine.length() == 0) {
3179 continue;
3180 }
3181
3182 //
3183 // Parse the test line. Skip blank and comment only lines.
3184 // Separate out the three main fields - pattern, flags, target.
3185 //
3186
3187 commentMat.reset(testLine);
3188 if (commentMat.lookingAt(status)) {
3189 // This line is a comment, or blank.
3190 continue;
3191 }
3192
3193 //
3194 // Pull out the pattern field, remove it from the test file line.
3195 //
3196 quotedStuffMat.reset(testLine);
3197 if (quotedStuffMat.lookingAt(status)) {
3198 testPattern = quotedStuffMat.group(2, status);
3199 testLine.remove(0, quotedStuffMat.end(0, status));
3200 } else {
3201 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3202 continue;
3203 }
3204
3205
3206 //
3207 // Pull out the flags from the test file line.
3208 //
3209 flagsMat.reset(testLine);
3210 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3211 testFlags = flagsMat.group(1, status);
3212 if (flagsMat.group(2, status).length() > 0) {
3213 errln("Bad Match flag at line %d. Scanning %c\n",
b75a7d8f
A
3214 lineNum, flagsMat.group(2, status).charAt(0));
3215 continue;
3216 }
729e4ab9
A
3217 testLine.remove(0, flagsMat.end(0, status));
3218
3219 //
3220 // Pull out the match string, as a whole.
3221 // We'll process the <tags> later.
3222 //
3223 quotedStuffMat.reset(testLine);
3224 if (quotedStuffMat.lookingAt(status)) {
3225 matchString = quotedStuffMat.group(2, status);
3226 testLine.remove(0, quotedStuffMat.end(0, status));
3227 } else {
3228 errln("Bad match string at test file line %d", lineNum);
3229 continue;
3230 }
3231
3232 //
3233 // The only thing left from the input line should be an optional trailing comment.
3234 //
3235 commentMat.reset(testLine);
3236 if (commentMat.lookingAt(status) == FALSE) {
3237 errln("Line %d: unexpected characters at end of test line.", lineNum);
3238 continue;
3239 }
3240
3241 //
3242 // Run the test
3243 //
3244 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3245 }
3246
3247 delete [] testData;
3248
3249}
3250
3251
3252
3253//---------------------------------------------------------------------------
3254//
3255// regex_find(pattern, flags, inputString, lineNumber)
3256//
3257// Function to run a single test from the Extended (data driven) tests.
3258// See file test/testdata/regextst.txt for a description of the
3259// pattern and inputString fields, and the allowed flags.
3260// lineNumber is the source line in regextst.txt of the test.
3261//
3262//---------------------------------------------------------------------------
3263
3264
3265// Set a value into a UVector at position specified by a decimal number in
3266// a UnicodeString. This is a utility function needed by the actual test function,
3267// which follows.
3268static void set(UVector &vec, int32_t val, UnicodeString index) {
3269 UErrorCode status=U_ZERO_ERROR;
3270 int32_t idx = 0;
3271 for (int32_t i=0; i<index.length(); i++) {
3272 int32_t d=u_charDigitValue(index.charAt(i));
3273 if (d<0) {return;}
3274 idx = idx*10 + d;
3275 }
3276 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3277 vec.setElementAt(val, idx);
3278}
3279
3280static void setInt(UVector &vec, int32_t val, int32_t idx) {
3281 UErrorCode status=U_ZERO_ERROR;
3282 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3283 vec.setElementAt(val, idx);
3284}
3285
3286static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3287{
3288 UBool couldFind = TRUE;
3289 UTEXT_SETNATIVEINDEX(utext, 0);
3290 int32_t i = 0;
3291 while (i < unistrOffset) {
3292 UChar32 c = UTEXT_NEXT32(utext);
3293 if (c != U_SENTINEL) {
3294 i += U16_LENGTH(c);
3295 } else {
3296 couldFind = FALSE;
3297 break;
3298 }
3299 }
4388f060 3300 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
729e4ab9
A
3301 return couldFind;
3302}
3303
3304
3305void RegexTest::regex_find(const UnicodeString &pattern,
3306 const UnicodeString &flags,
3307 const UnicodeString &inputString,
3308 const char *srcPath,
3309 int32_t line) {
3310 UnicodeString unEscapedInput;
3311 UnicodeString deTaggedInput;
57a6839d 3312
729e4ab9
A
3313 int32_t patternUTF8Length, inputUTF8Length;
3314 char *patternChars = NULL, *inputChars = NULL;
3315 UText patternText = UTEXT_INITIALIZER;
3316 UText inputText = UTEXT_INITIALIZER;
3317 UConverter *UTF8Converter = NULL;
3318
3319 UErrorCode status = U_ZERO_ERROR;
3320 UParseError pe;
3321 RegexPattern *parsePat = NULL;
3322 RegexMatcher *parseMatcher = NULL;
3323 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3324 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3325 UVector groupStarts(status);
3326 UVector groupEnds(status);
3327 UVector groupStartsUTF8(status);
3328 UVector groupEndsUTF8(status);
3329 UBool isMatch = FALSE, isUTF8Match = FALSE;
3330 UBool failed = FALSE;
3331 int32_t numFinds;
3332 int32_t i;
3333 UBool useMatchesFunc = FALSE;
3334 UBool useLookingAtFunc = FALSE;
3335 int32_t regionStart = -1;
3336 int32_t regionEnd = -1;
3337 int32_t regionStartUTF8 = -1;
3338 int32_t regionEndUTF8 = -1;
57a6839d 3339
729e4ab9
A
3340
3341 //
3342 // Compile the caller's pattern
3343 //
3344 uint32_t bflags = 0;
3345 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3346 bflags |= UREGEX_CASE_INSENSITIVE;
3347 }
3348 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3349 bflags |= UREGEX_COMMENTS;
3350 }
3351 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3352 bflags |= UREGEX_DOTALL;
3353 }
3354 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3355 bflags |= UREGEX_MULTILINE;
3356 }
57a6839d 3357
729e4ab9
A
3358 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3359 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3360 }
3361 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3362 bflags |= UREGEX_UNIX_LINES;
3363 }
4388f060
A
3364 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3365 bflags |= UREGEX_LITERAL;
3366 }
729e4ab9
A
3367
3368
3369 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3370 if (status != U_ZERO_ERROR) {
3371 #if UCONFIG_NO_BREAK_ITERATION==1
3372 // 'v' test flag means that the test pattern should not compile if ICU was configured
3373 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3374 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3375 goto cleanupAndReturn;
3376 }
3377 #endif
3378 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3379 // Expected pattern compilation error.
3380 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3381 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3382 }
3383 goto cleanupAndReturn;
3384 } else {
3385 // Unexpected pattern compilation error.
4388f060 3386 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
729e4ab9
A
3387 goto cleanupAndReturn;
3388 }
3389 }
3390
3391 UTF8Converter = ucnv_open("UTF8", &status);
3392 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
57a6839d 3393
729e4ab9
A
3394 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3395 status = U_ZERO_ERROR; // buffer overflow
3396 patternChars = new char[patternUTF8Length+1];
3397 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3398 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
57a6839d 3399
729e4ab9
A
3400 if (status == U_ZERO_ERROR) {
3401 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
57a6839d 3402
729e4ab9
A
3403 if (status != U_ZERO_ERROR) {
3404#if UCONFIG_NO_BREAK_ITERATION==1
3405 // 'v' test flag means that the test pattern should not compile if ICU was configured
3406 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3407 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3408 goto cleanupAndReturn;
3409 }
3410#endif
3411 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3412 // Expected pattern compilation error.
3413 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3414 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3415 }
3416 goto cleanupAndReturn;
3417 } else {
3418 // Unexpected pattern compilation error.
3419 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3420 goto cleanupAndReturn;
3421 }
3422 }
3423 }
57a6839d 3424
729e4ab9
A
3425 if (UTF8Pattern == NULL) {
3426 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3427 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3428 status = U_ZERO_ERROR;
3429 }
3430
3431 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
57a6839d 3432 callerPattern->dumpPattern();
729e4ab9
A
3433 }
3434
3435 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3436 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3437 goto cleanupAndReturn;
3438 }
3439
3440
3441 //
3442 // Number of times find() should be called on the test string, default to 1
3443 //
3444 numFinds = 1;
3445 for (i=2; i<=9; i++) {
3446 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3447 if (numFinds != 1) {
3448 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3449 goto cleanupAndReturn;
3450 }
3451 numFinds = i;
3452 }
3453 }
57a6839d 3454
729e4ab9
A
3455 // 'M' flag. Use matches() instead of find()
3456 if (flags.indexOf((UChar)0x4d) >= 0) {
3457 useMatchesFunc = TRUE;
3458 }
3459 if (flags.indexOf((UChar)0x4c) >= 0) {
3460 useLookingAtFunc = TRUE;
3461 }
3462
3463 //
3464 // Find the tags in the input data, remove them, and record the group boundary
3465 // positions.
3466 //
3467 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3468 REGEX_CHECK_STATUS_L(line);
3469
3470 unEscapedInput = inputString.unescape();
3471 parseMatcher = parsePat->matcher(unEscapedInput, status);
3472 REGEX_CHECK_STATUS_L(line);
3473 while(parseMatcher->find()) {
3474 parseMatcher->appendReplacement(deTaggedInput, "", status);
3475 REGEX_CHECK_STATUS;
3476 UnicodeString groupNum = parseMatcher->group(2, status);
3477 if (groupNum == "r") {
3478 // <r> or </r>, a region specification within the string
3479 if (parseMatcher->group(1, status) == "/") {
3480 regionEnd = deTaggedInput.length();
3481 } else {
3482 regionStart = deTaggedInput.length();
3483 }
3484 } else {
3485 // <digits> or </digits>, a group match boundary tag.
3486 if (parseMatcher->group(1, status) == "/") {
3487 set(groupEnds, deTaggedInput.length(), groupNum);
3488 } else {
3489 set(groupStarts, deTaggedInput.length(), groupNum);
3490 }
3491 }
3492 }
3493 parseMatcher->appendTail(deTaggedInput);
3494 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3495 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3496 errln("mismatched <r> tags");
3497 failed = TRUE;
3498 goto cleanupAndReturn;
3499 }
b75a7d8f 3500
729e4ab9
A
3501 //
3502 // Configure the matcher according to the flags specified with this test.
3503 //
3504 matcher = callerPattern->matcher(deTaggedInput, status);
3505 REGEX_CHECK_STATUS_L(line);
3506 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3507 matcher->setTrace(TRUE);
3508 }
57a6839d 3509
729e4ab9
A
3510 if (UTF8Pattern != NULL) {
3511 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3512 status = U_ZERO_ERROR; // buffer overflow
3513 inputChars = new char[inputUTF8Length+1];
3514 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3515 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3516
3517 if (status == U_ZERO_ERROR) {
4388f060 3518 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
729e4ab9
A
3519 REGEX_CHECK_STATUS_L(line);
3520 }
57a6839d 3521
729e4ab9
A
3522 if (UTF8Matcher == NULL) {
3523 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
2ca993e8 3524 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
729e4ab9
A
3525 status = U_ZERO_ERROR;
3526 }
3527 }
3528
3529 //
3530 // Generate native indices for UTF8 versions of region and capture group info
3531 //
3532 if (UTF8Matcher != NULL) {
2ca993e8
A
3533 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3534 UTF8Matcher->setTrace(TRUE);
3535 }
729e4ab9
A
3536 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3537 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
57a6839d 3538
729e4ab9
A
3539 // Fill out the native index UVector info.
3540 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3541 for (i=0; i<groupStarts.size(); i++) {
3542 int32_t start = groupStarts.elementAti(i);
3543 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3544 if (start >= 0) {
3545 int32_t startUTF8;
3546 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3547 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3548 failed = TRUE;
3549 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3550 }
3551 setInt(groupStartsUTF8, startUTF8, i);
3552 }
57a6839d 3553
729e4ab9
A
3554 int32_t end = groupEnds.elementAti(i);
3555 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3556 if (end >= 0) {
3557 int32_t endUTF8;
3558 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3559 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3560 failed = TRUE;
3561 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3562 }
3563 setInt(groupEndsUTF8, endUTF8, i);
3564 }
3565 }
3566 }
3567
3568 if (regionStart>=0) {
3569 matcher->region(regionStart, regionEnd, status);
3570 REGEX_CHECK_STATUS_L(line);
3571 if (UTF8Matcher != NULL) {
3572 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3573 REGEX_CHECK_STATUS_L(line);
3574 }
3575 }
3576 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3577 matcher->useAnchoringBounds(FALSE);
3578 if (UTF8Matcher != NULL) {
3579 UTF8Matcher->useAnchoringBounds(FALSE);
3580 }
3581 }
3582 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3583 matcher->useTransparentBounds(TRUE);
3584 if (UTF8Matcher != NULL) {
3585 UTF8Matcher->useTransparentBounds(TRUE);
3586 }
3587 }
57a6839d
A
3588
3589
729e4ab9
A
3590
3591 //
3592 // Do a find on the de-tagged input using the caller's pattern
3593 // TODO: error on count>1 and not find().
3594 // error on both matches() and lookingAt().
3595 //
3596 for (i=0; i<numFinds; i++) {
3597 if (useMatchesFunc) {
3598 isMatch = matcher->matches(status);
3599 if (UTF8Matcher != NULL) {
3600 isUTF8Match = UTF8Matcher->matches(status);
3601 }
3602 } else if (useLookingAtFunc) {
3603 isMatch = matcher->lookingAt(status);
3604 if (UTF8Matcher != NULL) {
3605 isUTF8Match = UTF8Matcher->lookingAt(status);
3606 }
b75a7d8f 3607 } else {
729e4ab9
A
3608 isMatch = matcher->find();
3609 if (UTF8Matcher != NULL) {
3610 isUTF8Match = UTF8Matcher->find();
3611 }
b75a7d8f 3612 }
729e4ab9
A
3613 }
3614 matcher->setTrace(FALSE);
2ca993e8
A
3615 if (UTF8Matcher) {
3616 UTF8Matcher->setTrace(FALSE);
3617 }
57a6839d
A
3618 if (U_FAILURE(status)) {
3619 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3620 }
b75a7d8f 3621
729e4ab9
A
3622 //
3623 // Match up the groups from the find() with the groups from the tags
3624 //
3625
3626 // number of tags should match number of groups from find operation.
3627 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3628 // G option in test means that capture group data is not available in the
3629 // expected results, so the check needs to be suppressed.
3630 if (isMatch == FALSE && groupStarts.size() != 0) {
4388f060 3631 dataerrln("Error at line %d: Match expected, but none found.", line);
729e4ab9
A
3632 failed = TRUE;
3633 goto cleanupAndReturn;
3634 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3635 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3636 failed = TRUE;
3637 goto cleanupAndReturn;
3638 }
2ca993e8
A
3639 if (isMatch && groupStarts.size() == 0) {
3640 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3641 failed = TRUE;
3642 }
3643 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3644 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3645 failed = TRUE;
3646 }
729e4ab9
A
3647
3648 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3649 // Only check for match / no match. Don't check capture groups.
729e4ab9
A
3650 goto cleanupAndReturn;
3651 }
3652
3653 REGEX_CHECK_STATUS_L(line);
3654 for (i=0; i<=matcher->groupCount(); i++) {
3655 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3656 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3657 if (matcher->start(i, status) != expectedStart) {
3658 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3659 line, i, expectedStart, matcher->start(i, status));
3660 failed = TRUE;
3661 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3662 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3663 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3664 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3665 failed = TRUE;
3666 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3667 }
57a6839d 3668
729e4ab9
A
3669 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3670 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3671 if (matcher->end(i, status) != expectedEnd) {
3672 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3673 line, i, expectedEnd, matcher->end(i, status));
3674 failed = TRUE;
3675 // Error on end position; keep going; real error is probably yet to come as group
3676 // end positions work from end of the input data towards the front.
3677 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3678 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3679 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3680 failed = TRUE;
3681 // Error on end position; keep going; real error is probably yet to come as group
3682 // end positions work from end of the input data towards the front.
3683 }
3684 }
3685 if ( matcher->groupCount()+1 < groupStarts.size()) {
3686 errln("Error at line %d: Expected %d capture groups, found %d.",
3687 line, groupStarts.size()-1, matcher->groupCount());
3688 failed = TRUE;
b75a7d8f 3689 }
729e4ab9
A
3690 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3691 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3692 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3693 failed = TRUE;
3694 }
3695
3696 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3697 matcher->requireEnd() == TRUE) {
3698 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3699 failed = TRUE;
3700 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3701 UTF8Matcher->requireEnd() == TRUE) {
3702 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3703 failed = TRUE;
3704 }
57a6839d 3705
729e4ab9
A
3706 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3707 matcher->requireEnd() == FALSE) {
3708 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3709 failed = TRUE;
3710 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3711 UTF8Matcher->requireEnd() == FALSE) {
3712 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3713 failed = TRUE;
3714 }
57a6839d 3715
729e4ab9
A
3716 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3717 matcher->hitEnd() == TRUE) {
3718 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3719 failed = TRUE;
3720 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3721 UTF8Matcher->hitEnd() == TRUE) {
3722 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3723 failed = TRUE;
3724 }
57a6839d 3725
729e4ab9
A
3726 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3727 matcher->hitEnd() == FALSE) {
3728 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3729 failed = TRUE;
3730 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3731 UTF8Matcher->hitEnd() == FALSE) {
3732 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3733 failed = TRUE;
3734 }
3735
3736
3737cleanupAndReturn:
3738 if (failed) {
3739 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3740 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3741 // callerPattern->dump();
3742 }
3743 delete parseMatcher;
3744 delete parsePat;
3745 delete UTF8Matcher;
3746 delete UTF8Pattern;
3747 delete matcher;
3748 delete callerPattern;
57a6839d 3749
729e4ab9
A
3750 utext_close(&inputText);
3751 delete[] inputChars;
3752 utext_close(&patternText);
3753 delete[] patternChars;
3754 ucnv_close(UTF8Converter);
3755}
3756
3757
3758
3759
3760//---------------------------------------------------------------------------
3761//
3762// Errors Check for error handling in patterns.
3763//
3764//---------------------------------------------------------------------------
3765void RegexTest::Errors() {
3766 // \escape sequences that aren't implemented yet.
3767 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3768
3769 // Missing close parentheses
3770 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3771 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3772 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3773
3774 // Extra close paren
3775 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3776 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3777 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3778
3779 // Look-ahead, Look-behind
3780 // TODO: add tests for unbounded length look-behinds.
3781 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3782
3783 // Attempt to use non-default flags
3784 {
3785 UParseError pe;
3786 UErrorCode status = U_ZERO_ERROR;
3787 int32_t flags = UREGEX_CANON_EQ |
3788 UREGEX_COMMENTS | UREGEX_DOTALL |
3789 UREGEX_MULTILINE;
3790 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3791 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3792 delete pat1;
3793 }
3794
3795
3796 // Quantifiers are allowed only after something that can be quantified.
3797 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3798 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3799 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3800
3801 // Mal-formed {min,max} quantifiers
3802 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3803 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3804 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3805 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3806 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3807 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
b331163b 3808 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
729e4ab9
A
3809 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3810 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
b75a7d8f 3811
729e4ab9
A
3812 // Ticket 5389
3813 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
b75a7d8f 3814
729e4ab9
A
3815 // Invalid Back Reference \0
3816 // For ICU 3.8 and earlier
3817 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3818 //
3819 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
b75a7d8f
A
3820
3821}
3822
3823
729e4ab9 3824//-------------------------------------------------------------------------------
57a6839d 3825//
729e4ab9
A
3826// Read a text data file, convert it to UChars, and return the data
3827// in one big UChar * buffer, which the caller must delete.
46f4442e 3828//
729e4ab9
A
3829//--------------------------------------------------------------------------------
3830UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3831 const char *defEncoding, UErrorCode &status) {
3832 UChar *retPtr = NULL;
3833 char *fileBuf = NULL;
3834 UConverter* conv = NULL;
3835 FILE *f = NULL;
46f4442e 3836
729e4ab9
A
3837 ulen = 0;
3838 if (U_FAILURE(status)) {
3839 return retPtr;
46f4442e 3840 }
46f4442e
A
3841
3842 //
729e4ab9 3843 // Open the file.
46f4442e 3844 //
729e4ab9
A
3845 f = fopen(fileName, "rb");
3846 if (f == 0) {
3847 dataerrln("Error opening test data file %s\n", fileName);
3848 status = U_FILE_ACCESS_ERROR;
3849 return NULL;
46f4442e 3850 }
729e4ab9
A
3851 //
3852 // Read it in
3853 //
3854 int32_t fileSize;
3855 int32_t amt_read;
3856
3857 fseek( f, 0, SEEK_END);
3858 fileSize = ftell(f);
3859 fileBuf = new char[fileSize];
3860 fseek(f, 0, SEEK_SET);
3861 amt_read = fread(fileBuf, 1, fileSize, f);
3862 if (amt_read != fileSize || fileSize <= 0) {
3863 errln("Error reading test data file.");
3864 goto cleanUpAndReturn;
46f4442e
A
3865 }
3866
729e4ab9
A
3867 //
3868 // Look for a Unicode Signature (BOM) on the data just read
3869 //
3870 int32_t signatureLength;
3871 const char * fileBufC;
3872 const char* encoding;
46f4442e 3873
729e4ab9
A
3874 fileBufC = fileBuf;
3875 encoding = ucnv_detectUnicodeSignature(
3876 fileBuf, fileSize, &signatureLength, &status);
3877 if(encoding!=NULL ){
3878 fileBufC += signatureLength;
3879 fileSize -= signatureLength;
3880 } else {
3881 encoding = defEncoding;
3882 if (strcmp(encoding, "utf-8") == 0) {
3883 errln("file %s is missing its BOM", fileName);
46f4442e
A
3884 }
3885 }
3886
729e4ab9
A
3887 //
3888 // Open a converter to take the rule file to UTF-16
3889 //
3890 conv = ucnv_open(encoding, &status);
3891 if (U_FAILURE(status)) {
3892 goto cleanUpAndReturn;
46f4442e
A
3893 }
3894
729e4ab9
A
3895 //
3896 // Convert the rules to UChar.
3897 // Preflight first to determine required buffer size.
3898 //
3899 ulen = ucnv_toUChars(conv,
3900 NULL, // dest,
3901 0, // destCapacity,
3902 fileBufC,
3903 fileSize,
3904 &status);
3905 if (status == U_BUFFER_OVERFLOW_ERROR) {
3906 // Buffer Overflow is expected from the preflight operation.
3907 status = U_ZERO_ERROR;
3908
3909 retPtr = new UChar[ulen+1];
3910 ucnv_toUChars(conv,
3911 retPtr, // dest,
3912 ulen+1,
3913 fileBufC,
3914 fileSize,
3915 &status);
46f4442e
A
3916 }
3917
729e4ab9
A
3918cleanUpAndReturn:
3919 fclose(f);
3920 delete[] fileBuf;
3921 ucnv_close(conv);
3922 if (U_FAILURE(status)) {
3923 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 3924 delete []retPtr;
729e4ab9
A
3925 retPtr = 0;
3926 ulen = 0;
3927 };
3928 return retPtr;
3929}
3930
3931
3932//-------------------------------------------------------------------------------
3933//
3934// PerlTests - Run Perl's regular expression tests
3935// The input file for this test is re_tests, the standard regular
3936// expression test data distributed with the Perl source code.
3937//
3938// Here is Perl's description of the test data file:
3939//
3940// # The tests are in a separate file 't/op/re_tests'.
3941// # Each line in that file is a separate test.
3942// # There are five columns, separated by tabs.
3943// #
3944// # Column 1 contains the pattern, optionally enclosed in C<''>.
3945// # Modifiers can be put after the closing C<'>.
3946// #
3947// # Column 2 contains the string to be matched.
3948// #
3949// # Column 3 contains the expected result:
3950// # y expect a match
3951// # n expect no match
3952// # c expect an error
3953// # B test exposes a known bug in Perl, should be skipped
3954// # b test exposes a known bug in Perl, should be skipped if noamp
3955// #
3956// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3957// #
3958// # Column 4 contains a string, usually C<$&>.
3959// #
3960// # Column 5 contains the expected result of double-quote
3961// # interpolating that string after the match, or start of error message.
3962// #
3963// # Column 6, if present, contains a reason why the test is skipped.
3964// # This is printed with "skipped", for harness to pick up.
3965// #
3966// # \n in the tests are interpolated, as are variables of the form ${\w+}.
3967// #
3968// # If you want to add a regular expression test that can't be expressed
3969// # in this format, don't add it here: put it in op/pat.t instead.
3970//
3971// For ICU, if field 3 contains an 'i', the test will be skipped.
3972// The test exposes is some known incompatibility between ICU and Perl regexps.
3973// (The i is in addition to whatever was there before.)
3974//
3975//-------------------------------------------------------------------------------
3976void RegexTest::PerlTests() {
3977 char tdd[2048];
3978 const char *srcPath;
3979 UErrorCode status = U_ZERO_ERROR;
3980 UParseError pe;
46f4442e
A
3981
3982 //
729e4ab9 3983 // Open and read the test data file.
46f4442e 3984 //
729e4ab9
A
3985 srcPath=getPath(tdd, "re_tests.txt");
3986 if(srcPath==NULL) {
3987 return; /* something went wrong, error already output */
46f4442e 3988 }
729e4ab9
A
3989
3990 int32_t len;
3991 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3992 if (U_FAILURE(status)) {
3993 return; /* something went wrong, error already output */
46f4442e
A
3994 }
3995
3996 //
729e4ab9 3997 // Put the test data into a UnicodeString
46f4442e 3998 //
729e4ab9 3999 UnicodeString testDataString(FALSE, testData, len);
46f4442e 4000
729e4ab9
A
4001 //
4002 // Regex to break the input file into lines, and strip the new lines.
4003 // One line per match, capture group one is the desired data.
4004 //
4005 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4006 if (U_FAILURE(status)) {
4007 dataerrln("RegexPattern::compile() error");
4008 return;
46f4442e 4009 }
729e4ab9 4010 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
46f4442e 4011
729e4ab9
A
4012 //
4013 // Regex to split a test file line into fields.
4014 // There are six fields, separated by tabs.
4015 //
4016 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
46f4442e
A
4017
4018 //
729e4ab9
A
4019 // Regex to identify test patterns with flag settings, and to separate them.
4020 // Test patterns with flags look like 'pattern'i
4021 // Test patterns without flags are not quoted: pattern
4022 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
46f4442e 4023 //
729e4ab9
A
4024 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4025 RegexMatcher* flagMat = flagPat->matcher(status);
46f4442e
A
4026
4027 //
729e4ab9
A
4028 // The Perl tests reference several perl-isms, which are evaluated/substituted
4029 // in the test data. Not being perl, this must be done explicitly. Here
4030 // are string constants and REs for these constructs.
46f4442e 4031 //
729e4ab9
A
4032 UnicodeString nulnulSrc("${nulnul}");
4033 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4034 nulnul = nulnul.unescape();
4035
4036 UnicodeString ffffSrc("${ffff}");
4037 UnicodeString ffff("\\uffff", -1, US_INV);
4038 ffff = ffff.unescape();
4039
4040 // regexp for $-[0], $+[2], etc.
4041 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4042 RegexMatcher *groupsMat = groupsPat->matcher(status);
4043
4044 // regexp for $0, $1, $2, etc.
4045 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4046 RegexMatcher *cgMat = cgPat->matcher(status);
4047
46f4442e
A
4048
4049 //
729e4ab9
A
4050 // Main Loop for the Perl Tests, runs once per line from the
4051 // test data file.
46f4442e 4052 //
729e4ab9
A
4053 int32_t lineNum = 0;
4054 int32_t skippedUnimplementedCount = 0;
4055 while (lineMat->find()) {
4056 lineNum++;
46f4442e 4057
729e4ab9
A
4058 //
4059 // Get a line, break it into its fields, do the Perl
4060 // variable substitutions.
4061 //
4062 UnicodeString line = lineMat->group(1, status);
4063 UnicodeString fields[7];
4064 fieldPat->split(line, fields, 7, status);
46f4442e 4065
729e4ab9
A
4066 flagMat->reset(fields[0]);
4067 flagMat->matches(status);
4068 UnicodeString pattern = flagMat->group(2, status);
4069 pattern.findAndReplace("${bang}", "!");
4070 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4071 pattern.findAndReplace(ffffSrc, ffff);
4072
4073 //
4074 // Identify patterns that include match flag settings,
4075 // split off the flags, remove the extra quotes.
4076 //
4077 UnicodeString flagStr = flagMat->group(3, status);
4078 if (U_FAILURE(status)) {
4079 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4080 return;
4081 }
4082 int32_t flags = 0;
4083 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4084 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4085 const UChar UChar_m = 0x6d;
4086 const UChar UChar_x = 0x78;
4087 const UChar UChar_y = 0x79;
4088 if (flagStr.indexOf(UChar_i) != -1) {
4089 flags |= UREGEX_CASE_INSENSITIVE;
4090 }
4091 if (flagStr.indexOf(UChar_m) != -1) {
4092 flags |= UREGEX_MULTILINE;
4093 }
4094 if (flagStr.indexOf(UChar_x) != -1) {
4095 flags |= UREGEX_COMMENTS;
46f4442e 4096 }
46f4442e 4097
729e4ab9
A
4098 //
4099 // Compile the test pattern.
4100 //
4101 status = U_ZERO_ERROR;
4102 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4103 if (status == U_REGEX_UNIMPLEMENTED) {
4104 //
4105 // Test of a feature that is planned for ICU, but not yet implemented.
4106 // skip the test.
4107 skippedUnimplementedCount++;
4108 delete testPat;
4109 status = U_ZERO_ERROR;
4110 continue;
46f4442e 4111 }
729e4ab9
A
4112
4113 if (U_FAILURE(status)) {
4114 // Some tests are supposed to generate errors.
4115 // Only report an error for tests that are supposed to succeed.
4116 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4117 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4118 {
4119 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4120 }
4121 status = U_ZERO_ERROR;
4122 delete testPat;
4123 continue;
46f4442e 4124 }
729e4ab9
A
4125
4126 if (fields[2].indexOf(UChar_i) >= 0) {
4127 // ICU should skip this test.
4128 delete testPat;
4129 continue;
46f4442e
A
4130 }
4131
729e4ab9
A
4132 if (fields[2].indexOf(UChar_c) >= 0) {
4133 // This pattern should have caused a compilation error, but didn't/
4134 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4135 delete testPat;
4136 continue;
4137 }
4138
4139 //
4140 // replace the Perl variables that appear in some of the
4141 // match data strings.
4142 //
4143 UnicodeString matchString = fields[1];
4144 matchString.findAndReplace(nulnulSrc, nulnul);
4145 matchString.findAndReplace(ffffSrc, ffff);
46f4442e 4146
729e4ab9
A
4147 // Replace any \n in the match string with an actual new-line char.
4148 // Don't do full unescape, as this unescapes more than Perl does, which
4149 // causes other spurious failures in the tests.
4150 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
46f4442e 4151
46f4442e
A
4152
4153
729e4ab9
A
4154 //
4155 // Run the test, check for expected match/don't match result.
4156 //
4157 RegexMatcher *testMat = testPat->matcher(matchString, status);
4158 UBool found = testMat->find();
4159 UBool expected = FALSE;
4160 if (fields[2].indexOf(UChar_y) >=0) {
4161 expected = TRUE;
4162 }
4163 if (expected != found) {
4164 errln("line %d: Expected %smatch, got %smatch",
4165 lineNum, expected?"":"no ", found?"":"no " );
4166 continue;
4167 }
57a6839d 4168
729e4ab9
A
4169 // Don't try to check expected results if there is no match.
4170 // (Some have stuff in the expected fields)
4171 if (!found) {
4172 delete testMat;
4173 delete testPat;
4174 continue;
4175 }
46f4442e 4176
729e4ab9
A
4177 //
4178 // Interpret the Perl expression from the fourth field of the data file,
4179 // building up an ICU string from the results of the ICU match.
4180 // The Perl expression will contain references to the results of
4181 // a regex match, including the matched string, capture group strings,
4182 // group starting and ending indicies, etc.
4183 //
4184 UnicodeString resultString;
4185 UnicodeString perlExpr = fields[3];
4186#if SUPPORT_MUTATING_INPUT_STRING
4187 groupsMat->reset(perlExpr);
4188 cgMat->reset(perlExpr);
4189#endif
46f4442e 4190
729e4ab9
A
4191 while (perlExpr.length() > 0) {
4192#if !SUPPORT_MUTATING_INPUT_STRING
4193 // Perferred usage. Reset after any modification to input string.
4194 groupsMat->reset(perlExpr);
4195 cgMat->reset(perlExpr);
4196#endif
b75a7d8f 4197
729e4ab9
A
4198 if (perlExpr.startsWith("$&")) {
4199 resultString.append(testMat->group(status));
4200 perlExpr.remove(0, 2);
4201 }
b75a7d8f 4202
729e4ab9
A
4203 else if (groupsMat->lookingAt(status)) {
4204 // $-[0] $+[2] etc.
4205 UnicodeString digitString = groupsMat->group(2, status);
4206 int32_t t = 0;
4207 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4208 UnicodeString plusOrMinus = groupsMat->group(1, status);
4209 int32_t matchPosition;
4210 if (plusOrMinus.compare("+") == 0) {
4211 matchPosition = testMat->end(groupNum, status);
4212 } else {
4213 matchPosition = testMat->start(groupNum, status);
4214 }
4215 if (matchPosition != -1) {
4216 ICU_Utility::appendNumber(resultString, matchPosition);
4217 }
4218 perlExpr.remove(0, groupsMat->end(status));
4219 }
b75a7d8f 4220
729e4ab9
A
4221 else if (cgMat->lookingAt(status)) {
4222 // $1, $2, $3, etc.
4223 UnicodeString digitString = cgMat->group(1, status);
4224 int32_t t = 0;
4225 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4226 if (U_SUCCESS(status)) {
4227 resultString.append(testMat->group(groupNum, status));
4228 status = U_ZERO_ERROR;
4229 }
4230 perlExpr.remove(0, cgMat->end(status));
4231 }
b75a7d8f 4232
729e4ab9
A
4233 else if (perlExpr.startsWith("@-")) {
4234 int32_t i;
4235 for (i=0; i<=testMat->groupCount(); i++) {
4236 if (i>0) {
4237 resultString.append(" ");
4238 }
4239 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4240 }
4241 perlExpr.remove(0, 2);
4242 }
b75a7d8f 4243
729e4ab9
A
4244 else if (perlExpr.startsWith("@+")) {
4245 int32_t i;
4246 for (i=0; i<=testMat->groupCount(); i++) {
4247 if (i>0) {
4248 resultString.append(" ");
4249 }
4250 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4251 }
4252 perlExpr.remove(0, 2);
4253 }
b75a7d8f 4254
729e4ab9
A
4255 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4256 // or as an escaped sequence (e.g. \n)
4257 if (perlExpr.length() > 1) {
4258 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4259 }
4260 UChar c = perlExpr.charAt(0);
4261 switch (c) {
4262 case 'n': c = '\n'; break;
4263 // add any other escape sequences that show up in the test expected results.
4264 }
4265 resultString.append(c);
4266 perlExpr.remove(0, 1);
4267 }
b75a7d8f 4268
729e4ab9
A
4269 else {
4270 // Any characters from the perl expression that we don't explicitly
4271 // recognize before here are assumed to be literals and copied
4272 // as-is to the expected results.
4273 resultString.append(perlExpr.charAt(0));
4274 perlExpr.remove(0, 1);
4275 }
374ca955 4276
729e4ab9
A
4277 if (U_FAILURE(status)) {
4278 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4279 break;
4280 }
4281 }
b75a7d8f 4282
729e4ab9
A
4283 //
4284 // Expected Results Compare
4285 //
4286 UnicodeString expectedS(fields[4]);
4287 expectedS.findAndReplace(nulnulSrc, nulnul);
4288 expectedS.findAndReplace(ffffSrc, ffff);
4289 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
4290
4291
729e4ab9
A
4292 if (expectedS.compare(resultString) != 0) {
4293 err("Line %d: Incorrect perl expression results.", lineNum);
4294 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4295 }
374ca955 4296
729e4ab9
A
4297 delete testMat;
4298 delete testPat;
b75a7d8f 4299 }
374ca955 4300
b75a7d8f 4301 //
729e4ab9 4302 // All done. Clean up allocated stuff.
b75a7d8f 4303 //
729e4ab9
A
4304 delete cgMat;
4305 delete cgPat;
374ca955 4306
729e4ab9
A
4307 delete groupsMat;
4308 delete groupsPat;
374ca955 4309
729e4ab9
A
4310 delete flagMat;
4311 delete flagPat;
374ca955 4312
729e4ab9
A
4313 delete lineMat;
4314 delete linePat;
374ca955 4315
729e4ab9
A
4316 delete fieldPat;
4317 delete [] testData;
374ca955 4318
374ca955 4319
729e4ab9 4320 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
b75a7d8f 4321
b75a7d8f
A
4322}
4323
4324
4325//-------------------------------------------------------------------------------
4326//
729e4ab9
A
4327// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4328// (instead of using UnicodeStrings) to test the alternate engine.
4329// The input file for this test is re_tests, the standard regular
4330// expression test data distributed with the Perl source code.
4331// See PerlTests() for more information.
b75a7d8f
A
4332//
4333//-------------------------------------------------------------------------------
729e4ab9 4334void RegexTest::PerlTestsUTF8() {
374ca955
A
4335 char tdd[2048];
4336 const char *srcPath;
b75a7d8f
A
4337 UErrorCode status = U_ZERO_ERROR;
4338 UParseError pe;
729e4ab9
A
4339 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4340 UText patternText = UTEXT_INITIALIZER;
4341 char *patternChars = NULL;
4342 int32_t patternLength;
4343 int32_t patternCapacity = 0;
4344 UText inputText = UTEXT_INITIALIZER;
4345 char *inputChars = NULL;
4346 int32_t inputLength;
4347 int32_t inputCapacity = 0;
4348
4349 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
b75a7d8f
A
4350
4351 //
4352 // Open and read the test data file.
4353 //
374ca955
A
4354 srcPath=getPath(tdd, "re_tests.txt");
4355 if(srcPath==NULL) {
4356 return; /* something went wrong, error already output */
b75a7d8f
A
4357 }
4358
46f4442e
A
4359 int32_t len;
4360 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
374ca955
A
4361 if (U_FAILURE(status)) {
4362 return; /* something went wrong, error already output */
4363 }
b75a7d8f
A
4364
4365 //
4366 // Put the test data into a UnicodeString
4367 //
4368 UnicodeString testDataString(FALSE, testData, len);
4369
4370 //
4371 // Regex to break the input file into lines, and strip the new lines.
4372 // One line per match, capture group one is the desired data.
4373 //
46f4442e 4374 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
73c04bcf
A
4375 if (U_FAILURE(status)) {
4376 dataerrln("RegexPattern::compile() error");
4377 return;
4378 }
b75a7d8f
A
4379 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4380
4381 //
4382 // Regex to split a test file line into fields.
4383 // There are six fields, separated by tabs.
4384 //
46f4442e 4385 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
b75a7d8f
A
4386
4387 //
4388 // Regex to identify test patterns with flag settings, and to separate them.
4389 // Test patterns with flags look like 'pattern'i
4390 // Test patterns without flags are not quoted: pattern
4391 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4392 //
46f4442e 4393 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
374ca955 4394 RegexMatcher* flagMat = flagPat->matcher(status);
b75a7d8f
A
4395
4396 //
4397 // The Perl tests reference several perl-isms, which are evaluated/substituted
4398 // in the test data. Not being perl, this must be done explicitly. Here
4399 // are string constants and REs for these constructs.
4400 //
4401 UnicodeString nulnulSrc("${nulnul}");
46f4442e 4402 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
b75a7d8f
A
4403 nulnul = nulnul.unescape();
4404
4405 UnicodeString ffffSrc("${ffff}");
46f4442e 4406 UnicodeString ffff("\\uffff", -1, US_INV);
b75a7d8f
A
4407 ffff = ffff.unescape();
4408
4409 // regexp for $-[0], $+[2], etc.
46f4442e 4410 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
374ca955
A
4411 RegexMatcher *groupsMat = groupsPat->matcher(status);
4412
b75a7d8f 4413 // regexp for $0, $1, $2, etc.
46f4442e 4414 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
374ca955 4415 RegexMatcher *cgMat = cgPat->matcher(status);
b75a7d8f
A
4416
4417
4418 //
4419 // Main Loop for the Perl Tests, runs once per line from the
4420 // test data file.
4421 //
4422 int32_t lineNum = 0;
4423 int32_t skippedUnimplementedCount = 0;
4424 while (lineMat->find()) {
4425 lineNum++;
4426
4427 //
4428 // Get a line, break it into its fields, do the Perl
4429 // variable substitutions.
4430 //
4431 UnicodeString line = lineMat->group(1, status);
4432 UnicodeString fields[7];
4433 fieldPat->split(line, fields, 7, status);
4434
4435 flagMat->reset(fields[0]);
4436 flagMat->matches(status);
4437 UnicodeString pattern = flagMat->group(2, status);
4438 pattern.findAndReplace("${bang}", "!");
46f4442e 4439 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
b75a7d8f
A
4440 pattern.findAndReplace(ffffSrc, ffff);
4441
4442 //
4443 // Identify patterns that include match flag settings,
4444 // split off the flags, remove the extra quotes.
4445 //
4446 UnicodeString flagStr = flagMat->group(3, status);
4447 if (U_FAILURE(status)) {
4448 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4449 return;
4450 }
4451 int32_t flags = 0;
4452 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4453 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4454 const UChar UChar_m = 0x6d;
4455 const UChar UChar_x = 0x78;
4456 const UChar UChar_y = 0x79;
4457 if (flagStr.indexOf(UChar_i) != -1) {
4458 flags |= UREGEX_CASE_INSENSITIVE;
4459 }
4460 if (flagStr.indexOf(UChar_m) != -1) {
4461 flags |= UREGEX_MULTILINE;
4462 }
4463 if (flagStr.indexOf(UChar_x) != -1) {
4464 flags |= UREGEX_COMMENTS;
4465 }
57a6839d 4466
729e4ab9
A
4467 //
4468 // Put the pattern in a UTF-8 UText
4469 //
4470 status = U_ZERO_ERROR;
4471 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4472 if (status == U_BUFFER_OVERFLOW_ERROR) {
4473 status = U_ZERO_ERROR;
4474 delete[] patternChars;
4475 patternCapacity = patternLength + 1;
4476 patternChars = new char[patternCapacity];
4477 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4478 }
4479 utext_openUTF8(&patternText, patternChars, patternLength, &status);
b75a7d8f
A
4480
4481 //
4482 // Compile the test pattern.
4483 //
729e4ab9 4484 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
b75a7d8f
A
4485 if (status == U_REGEX_UNIMPLEMENTED) {
4486 //
4487 // Test of a feature that is planned for ICU, but not yet implemented.
4488 // skip the test.
4489 skippedUnimplementedCount++;
4490 delete testPat;
4491 status = U_ZERO_ERROR;
4492 continue;
4493 }
4494
4495 if (U_FAILURE(status)) {
4496 // Some tests are supposed to generate errors.
4497 // Only report an error for tests that are supposed to succeed.
4498 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4499 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4500 {
4501 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4502 }
4503 status = U_ZERO_ERROR;
4504 delete testPat;
4505 continue;
4506 }
4507
4508 if (fields[2].indexOf(UChar_i) >= 0) {
4509 // ICU should skip this test.
4510 delete testPat;
4511 continue;
4512 }
4513
4514 if (fields[2].indexOf(UChar_c) >= 0) {
4515 // This pattern should have caused a compilation error, but didn't/
4516 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4517 delete testPat;
4518 continue;
4519 }
4520
729e4ab9 4521
b75a7d8f
A
4522 //
4523 // replace the Perl variables that appear in some of the
374ca955 4524 // match data strings.
b75a7d8f
A
4525 //
4526 UnicodeString matchString = fields[1];
4527 matchString.findAndReplace(nulnulSrc, nulnul);
4528 matchString.findAndReplace(ffffSrc, ffff);
4529
4530 // Replace any \n in the match string with an actual new-line char.
4531 // Don't do full unescape, as this unescapes more than Perl does, which
4532 // causes other spurious failures in the tests.
46f4442e 4533 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
374ca955 4534
729e4ab9
A
4535 //
4536 // Put the input in a UTF-8 UText
4537 //
4538 status = U_ZERO_ERROR;
4539 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4540 if (status == U_BUFFER_OVERFLOW_ERROR) {
4541 status = U_ZERO_ERROR;
4542 delete[] inputChars;
4543 inputCapacity = inputLength + 1;
4544 inputChars = new char[inputCapacity];
4545 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4546 }
4547 utext_openUTF8(&inputText, inputChars, inputLength, &status);
b75a7d8f
A
4548
4549 //
4550 // Run the test, check for expected match/don't match result.
4551 //
4388f060 4552 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
b75a7d8f
A
4553 UBool found = testMat->find();
4554 UBool expected = FALSE;
4555 if (fields[2].indexOf(UChar_y) >=0) {
4556 expected = TRUE;
4557 }
4558 if (expected != found) {
374ca955 4559 errln("line %d: Expected %smatch, got %smatch",
b75a7d8f
A
4560 lineNum, expected?"":"no ", found?"":"no " );
4561 continue;
4562 }
57a6839d 4563
46f4442e
A
4564 // Don't try to check expected results if there is no match.
4565 // (Some have stuff in the expected fields)
4566 if (!found) {
4567 delete testMat;
4568 delete testPat;
4569 continue;
4570 }
b75a7d8f
A
4571
4572 //
4573 // Interpret the Perl expression from the fourth field of the data file,
4574 // building up an ICU string from the results of the ICU match.
374ca955 4575 // The Perl expression will contain references to the results of
b75a7d8f
A
4576 // a regex match, including the matched string, capture group strings,
4577 // group starting and ending indicies, etc.
4578 //
4579 UnicodeString resultString;
4580 UnicodeString perlExpr = fields[3];
b75a7d8f
A
4581
4582 while (perlExpr.length() > 0) {
729e4ab9
A
4583 groupsMat->reset(perlExpr);
4584 cgMat->reset(perlExpr);
4585
b75a7d8f
A
4586 if (perlExpr.startsWith("$&")) {
4587 resultString.append(testMat->group(status));
4588 perlExpr.remove(0, 2);
4589 }
4590
4591 else if (groupsMat->lookingAt(status)) {
4592 // $-[0] $+[2] etc.
4593 UnicodeString digitString = groupsMat->group(2, status);
4594 int32_t t = 0;
4595 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4596 UnicodeString plusOrMinus = groupsMat->group(1, status);
4597 int32_t matchPosition;
4598 if (plusOrMinus.compare("+") == 0) {
4599 matchPosition = testMat->end(groupNum, status);
4600 } else {
4601 matchPosition = testMat->start(groupNum, status);
4602 }
4603 if (matchPosition != -1) {
4604 ICU_Utility::appendNumber(resultString, matchPosition);
4605 }
4606 perlExpr.remove(0, groupsMat->end(status));
4607 }
4608
4609 else if (cgMat->lookingAt(status)) {
4610 // $1, $2, $3, etc.
4611 UnicodeString digitString = cgMat->group(1, status);
4612 int32_t t = 0;
4613 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4614 if (U_SUCCESS(status)) {
4615 resultString.append(testMat->group(groupNum, status));
4616 status = U_ZERO_ERROR;
4617 }
4618 perlExpr.remove(0, cgMat->end(status));
4619 }
4620
4621 else if (perlExpr.startsWith("@-")) {
46f4442e 4622 int32_t i;
b75a7d8f
A
4623 for (i=0; i<=testMat->groupCount(); i++) {
4624 if (i>0) {
4625 resultString.append(" ");
4626 }
4627 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4628 }
4629 perlExpr.remove(0, 2);
4630 }
4631
4632 else if (perlExpr.startsWith("@+")) {
46f4442e 4633 int32_t i;
b75a7d8f
A
4634 for (i=0; i<=testMat->groupCount(); i++) {
4635 if (i>0) {
4636 resultString.append(" ");
4637 }
4638 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4639 }
4640 perlExpr.remove(0, 2);
4641 }
4642
46f4442e 4643 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
b75a7d8f
A
4644 // or as an escaped sequence (e.g. \n)
4645 if (perlExpr.length() > 1) {
4646 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4647 }
4648 UChar c = perlExpr.charAt(0);
4649 switch (c) {
4650 case 'n': c = '\n'; break;
4651 // add any other escape sequences that show up in the test expected results.
4652 }
374ca955 4653 resultString.append(c);
b75a7d8f
A
4654 perlExpr.remove(0, 1);
4655 }
4656
4657 else {
4658 // Any characters from the perl expression that we don't explicitly
4659 // recognize before here are assumed to be literals and copied
4660 // as-is to the expected results.
4661 resultString.append(perlExpr.charAt(0));
4662 perlExpr.remove(0, 1);
4663 }
4664
4665 if (U_FAILURE(status)) {
4666 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4667 break;
4668 }
4669 }
374ca955 4670
b75a7d8f
A
4671 //
4672 // Expected Results Compare
4673 //
4674 UnicodeString expectedS(fields[4]);
4675 expectedS.findAndReplace(nulnulSrc, nulnul);
4676 expectedS.findAndReplace(ffffSrc, ffff);
46f4442e 4677 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
4678
4679
4680 if (expectedS.compare(resultString) != 0) {
73c04bcf 4681 err("Line %d: Incorrect perl expression results.", lineNum);
729e4ab9 4682 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
b75a7d8f
A
4683 }
4684
4685 delete testMat;
4686 delete testPat;
4687 }
4688
4689 //
4690 // All done. Clean up allocated stuff.
4691 //
4692 delete cgMat;
4693 delete cgPat;
374ca955 4694
b75a7d8f
A
4695 delete groupsMat;
4696 delete groupsPat;
374ca955 4697
b75a7d8f
A
4698 delete flagMat;
4699 delete flagPat;
4700
4701 delete lineMat;
4702 delete linePat;
374ca955 4703
b75a7d8f
A
4704 delete fieldPat;
4705 delete [] testData;
57a6839d 4706
729e4ab9
A
4707 utext_close(&patternText);
4708 utext_close(&inputText);
57a6839d 4709
729e4ab9
A
4710 delete [] patternChars;
4711 delete [] inputChars;
374ca955 4712
b75a7d8f
A
4713
4714 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4715
4716}
4717
4718
729e4ab9
A
4719//--------------------------------------------------------------
4720//
4721// Bug6149 Verify limits to heap expansion for backtrack stack.
4722// Use this pattern,
57a6839d
A
4723// "(a?){1,8000000}"
4724// Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4725// This test is likely to be fragile, as further optimizations stop
4726// more cases of pointless looping in the match engine.
729e4ab9
A
4727//
4728//---------------------------------------------------------------
4729void RegexTest::Bug6149() {
57a6839d 4730 UnicodeString pattern("(a?){1,8000000}");
729e4ab9
A
4731 UnicodeString s("xyz");
4732 uint32_t flags = 0;
4733 UErrorCode status = U_ZERO_ERROR;
4734
4735 RegexMatcher matcher(pattern, s, flags, status);
4736 UBool result = false;
4737 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4738 REGEX_ASSERT(result == FALSE);
4739 }
4740
4741
46f4442e
A
4742//
4743// Callbacks() Test the callback function.
4744// When set, callbacks occur periodically during matching operations,
4745// giving the application code the ability to abort the operation
4746// before its normal completion.
4747//
4748
4749struct callBackContext {
4750 RegexTest *test;
4751 int32_t maxCalls;
4752 int32_t numCalls;
4753 int32_t lastSteps;
4754 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4755};
4756
4757U_CDECL_BEGIN
4758static UBool U_CALLCONV
4759testCallBackFn(const void *context, int32_t steps) {
4760 callBackContext *info = (callBackContext *)context;
4761 if (info->lastSteps+1 != steps) {
4762 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4763 }
4764 info->lastSteps = steps;
4765 info->numCalls++;
4766 return (info->numCalls < info->maxCalls);
4767}
4768U_CDECL_END
4769
4770void RegexTest::Callbacks() {
4771 {
4772 // Getter returns NULLs if no callback has been set
57a6839d 4773
46f4442e
A
4774 // The variables that the getter will fill in.
4775 // Init to non-null values so that the action of the getter can be seen.
4776 const void *returnedContext = &returnedContext;
4777 URegexMatchCallback *returnedFn = &testCallBackFn;
57a6839d 4778
46f4442e
A
4779 UErrorCode status = U_ZERO_ERROR;
4780 RegexMatcher matcher("x", 0, status);
4781 REGEX_CHECK_STATUS;
4782 matcher.getMatchCallback(returnedFn, returnedContext, status);
4783 REGEX_CHECK_STATUS;
4784 REGEX_ASSERT(returnedFn == NULL);
4785 REGEX_ASSERT(returnedContext == NULL);
4786 }
57a6839d 4787
46f4442e
A
4788 {
4789 // Set and Get work
4790 callBackContext cbInfo = {this, 0, 0, 0};
4791 const void *returnedContext;
4792 URegexMatchCallback *returnedFn;
4793 UErrorCode status = U_ZERO_ERROR;
4794 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4795 REGEX_CHECK_STATUS;
4796 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4797 REGEX_CHECK_STATUS;
4798 matcher.getMatchCallback(returnedFn, returnedContext, status);
4799 REGEX_CHECK_STATUS;
4800 REGEX_ASSERT(returnedFn == testCallBackFn);
4801 REGEX_ASSERT(returnedContext == &cbInfo);
57a6839d 4802
46f4442e
A
4803 // A short-running match shouldn't invoke the callback
4804 status = U_ZERO_ERROR;
4805 cbInfo.reset(1);
4806 UnicodeString s = "xxx";
4807 matcher.reset(s);
4808 REGEX_ASSERT(matcher.matches(status));
4809 REGEX_CHECK_STATUS;
4810 REGEX_ASSERT(cbInfo.numCalls == 0);
57a6839d 4811
46f4442e
A
4812 // A medium-length match that runs long enough to invoke the
4813 // callback, but not so long that the callback aborts it.
4814 status = U_ZERO_ERROR;
4815 cbInfo.reset(4);
4816 s = "aaaaaaaaaaaaaaaaaaab";
4817 matcher.reset(s);
4818 REGEX_ASSERT(matcher.matches(status)==FALSE);
4819 REGEX_CHECK_STATUS;
4820 REGEX_ASSERT(cbInfo.numCalls > 0);
57a6839d 4821
46f4442e
A
4822 // A longer running match that the callback function will abort.
4823 status = U_ZERO_ERROR;
4824 cbInfo.reset(4);
4825 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4826 matcher.reset(s);
4827 REGEX_ASSERT(matcher.matches(status)==FALSE);
4828 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4829 REGEX_ASSERT(cbInfo.numCalls == 4);
b331163b
A
4830
4831 // A longer running find that the callback function will abort.
4832 status = U_ZERO_ERROR;
4833 cbInfo.reset(4);
4834 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4835 matcher.reset(s);
4836 REGEX_ASSERT(matcher.find(status)==FALSE);
4837 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4838 REGEX_ASSERT(cbInfo.numCalls == 4);
46f4442e 4839 }
57a6839d 4840
46f4442e
A
4841
4842}
b75a7d8f 4843
729e4ab9
A
4844
4845//
4846// FindProgressCallbacks() Test the find "progress" callback function.
4847// When set, the find progress callback will be invoked during find operations
4848// after each return from a match attempt, giving the application the opportunity
4849// to terminate a long-running find operation before its normal completion.
4850//
4851
4852struct progressCallBackContext {
4853 RegexTest *test;
4854 int64_t lastIndex;
4855 int32_t maxCalls;
4856 int32_t numCalls;
4857 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4858};
4859
b331163b
A
4860// call-back function for find().
4861// Return TRUE to continue the find().
4862// Return FALSE to stop the find().
729e4ab9
A
4863U_CDECL_BEGIN
4864static UBool U_CALLCONV
4865testProgressCallBackFn(const void *context, int64_t matchIndex) {
4866 progressCallBackContext *info = (progressCallBackContext *)context;
4867 info->numCalls++;
4868 info->lastIndex = matchIndex;
4869// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4870 return (info->numCalls < info->maxCalls);
4871}
4872U_CDECL_END
4873
4874void RegexTest::FindProgressCallbacks() {
4875 {
4876 // Getter returns NULLs if no callback has been set
57a6839d 4877
729e4ab9
A
4878 // The variables that the getter will fill in.
4879 // Init to non-null values so that the action of the getter can be seen.
4880 const void *returnedContext = &returnedContext;
4881 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
57a6839d 4882
729e4ab9
A
4883 UErrorCode status = U_ZERO_ERROR;
4884 RegexMatcher matcher("x", 0, status);
4885 REGEX_CHECK_STATUS;
4886 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4887 REGEX_CHECK_STATUS;
4888 REGEX_ASSERT(returnedFn == NULL);
4889 REGEX_ASSERT(returnedContext == NULL);
4890 }
57a6839d 4891
729e4ab9
A
4892 {
4893 // Set and Get work
4894 progressCallBackContext cbInfo = {this, 0, 0, 0};
4895 const void *returnedContext;
4896 URegexFindProgressCallback *returnedFn;
4897 UErrorCode status = U_ZERO_ERROR;
b331163b 4898 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
729e4ab9
A
4899 REGEX_CHECK_STATUS;
4900 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4901 REGEX_CHECK_STATUS;
4902 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4903 REGEX_CHECK_STATUS;
4904 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4905 REGEX_ASSERT(returnedContext == &cbInfo);
57a6839d 4906
b331163b 4907 // A find that matches on the initial position does NOT invoke the callback.
729e4ab9
A
4908 status = U_ZERO_ERROR;
4909 cbInfo.reset(100);
b331163b 4910 UnicodeString s = "aaxxx";
729e4ab9
A
4911 matcher.reset(s);
4912#if 0
4913 matcher.setTrace(TRUE);
4914#endif
4915 REGEX_ASSERT(matcher.find(0, status));
4916 REGEX_CHECK_STATUS;
4917 REGEX_ASSERT(cbInfo.numCalls == 0);
57a6839d 4918
b331163b
A
4919 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4920 // but not so many times that we interrupt the operation.
729e4ab9
A
4921 status = U_ZERO_ERROR;
4922 s = "aaaaaaaaaaaaaaaaaaab";
4923 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4924 matcher.reset(s);
4925 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4926 REGEX_CHECK_STATUS;
4927 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
57a6839d 4928
729e4ab9
A
4929 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4930 status = U_ZERO_ERROR;
4931 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4932 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4933 matcher.reset(s1);
4934 REGEX_ASSERT(matcher.find(0, status)==FALSE);
b331163b 4935 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
729e4ab9
A
4936 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4937
729e4ab9
A
4938 // Now a match that will succeed, but after an interruption
4939 status = U_ZERO_ERROR;
4940 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4941 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4942 matcher.reset(s2);
4943 REGEX_ASSERT(matcher.find(0, status)==FALSE);
b331163b 4944 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
729e4ab9
A
4945 // Now retry the match from where left off
4946 cbInfo.maxCalls = 100; // No callback limit
b331163b 4947 status = U_ZERO_ERROR;
729e4ab9
A
4948 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4949 REGEX_CHECK_STATUS;
729e4ab9 4950 }
57a6839d 4951
729e4ab9
A
4952
4953}
4954
4955
4956//---------------------------------------------------------------------------
4957//
4958// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4959// UTexts. The pure-C implementation of UText
4960// has no mutable backing stores, but we can
4961// use UnicodeString here to test the functionality.
4962//
4963//---------------------------------------------------------------------------
4964void RegexTest::PreAllocatedUTextCAPI () {
4965 UErrorCode status = U_ZERO_ERROR;
4966 URegularExpression *re;
4967 UText patternText = UTEXT_INITIALIZER;
4968 UnicodeString buffer;
4969 UText bufferText = UTEXT_INITIALIZER;
57a6839d 4970
729e4ab9
A
4971 utext_openUnicodeString(&bufferText, &buffer, &status);
4972
4973 /*
4974 * getText() and getUText()
4975 */
4976 {
4977 UText text1 = UTEXT_INITIALIZER;
4978 UText text2 = UTEXT_INITIALIZER;
4979 UChar text2Chars[20];
4980 UText *resultText;
4981
4982 status = U_ZERO_ERROR;
4983 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4984 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4985 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4986 utext_openUChars(&text2, text2Chars, -1, &status);
57a6839d 4987
729e4ab9
A
4988 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4989 re = uregex_openUText(&patternText, 0, NULL, &status);
4990
4991 /* First set a UText */
4992 uregex_setUText(re, &text1, &status);
4993 resultText = uregex_getUText(re, &bufferText, &status);
4994 REGEX_CHECK_STATUS;
4995 REGEX_ASSERT(resultText == &bufferText);
4996 utext_setNativeIndex(resultText, 0);
4997 utext_setNativeIndex(&text1, 0);
4388f060 4998 REGEX_ASSERT(testUTextEqual(resultText, &text1));
57a6839d 4999
729e4ab9
A
5000 resultText = uregex_getUText(re, &bufferText, &status);
5001 REGEX_CHECK_STATUS;
5002 REGEX_ASSERT(resultText == &bufferText);
5003 utext_setNativeIndex(resultText, 0);
5004 utext_setNativeIndex(&text1, 0);
4388f060 5005 REGEX_ASSERT(testUTextEqual(resultText, &text1));
729e4ab9
A
5006
5007 /* Then set a UChar * */
5008 uregex_setText(re, text2Chars, 7, &status);
5009 resultText = uregex_getUText(re, &bufferText, &status);
5010 REGEX_CHECK_STATUS;
5011 REGEX_ASSERT(resultText == &bufferText);
5012 utext_setNativeIndex(resultText, 0);
5013 utext_setNativeIndex(&text2, 0);
4388f060 5014 REGEX_ASSERT(testUTextEqual(resultText, &text2));
57a6839d 5015
729e4ab9
A
5016 uregex_close(re);
5017 utext_close(&text1);
5018 utext_close(&text2);
5019 }
5020
5021 /*
5022 * group()
5023 */
5024 {
5025 UChar text1[80];
5026 UText *actual;
5027 UBool result;
b331163b
A
5028 int64_t length = 0;
5029
5030 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5031 // 012345678901234567890123456789012345678901234567
5032 // 0 1 2 3 4
729e4ab9
A
5033
5034 status = U_ZERO_ERROR;
5035 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5036 REGEX_CHECK_STATUS;
5037
5038 uregex_setText(re, text1, -1, &status);
5039 result = uregex_find(re, 0, &status);
5040 REGEX_ASSERT(result==TRUE);
5041
b331163b 5042 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
729e4ab9 5043 status = U_ZERO_ERROR;
b331163b 5044 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
729e4ab9
A
5045 REGEX_CHECK_STATUS;
5046 REGEX_ASSERT(actual == &bufferText);
b331163b
A
5047 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5048 REGEX_ASSERT(length == 16);
5049 REGEX_ASSERT(utext_nativeLength(actual) == 47);
729e4ab9 5050
b331163b 5051 /* Capture group #1. Should succeed, matching " interior ". */
729e4ab9 5052 status = U_ZERO_ERROR;
b331163b 5053 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
729e4ab9
A
5054 REGEX_CHECK_STATUS;
5055 REGEX_ASSERT(actual == &bufferText);
b331163b
A
5056 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5057 REGEX_ASSERT(length == 10);
5058 REGEX_ASSERT(utext_nativeLength(actual) == 47);
729e4ab9
A
5059
5060 /* Capture group out of range. Error. */
5061 status = U_ZERO_ERROR;
b331163b 5062 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
729e4ab9
A
5063 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5064 REGEX_ASSERT(actual == &bufferText);
729e4ab9
A
5065 uregex_close(re);
5066
5067 }
57a6839d 5068
729e4ab9
A
5069 /*
5070 * replaceFirst()
5071 */
5072 {
5073 UChar text1[80];
5074 UChar text2[80];
5075 UText replText = UTEXT_INITIALIZER;
5076 UText *result;
b331163b
A
5077 status = U_ZERO_ERROR;
5078 utext_openUnicodeString(&bufferText, &buffer, &status);
57a6839d 5079
729e4ab9 5080 status = U_ZERO_ERROR;
b331163b
A
5081 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5082 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
729e4ab9
A
5083 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5084
5085 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5086 REGEX_CHECK_STATUS;
5087
5088 /* Normal case, with match */
5089 uregex_setText(re, text1, -1, &status);
b331163b 5090 REGEX_CHECK_STATUS;
729e4ab9 5091 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
b331163b 5092 REGEX_CHECK_STATUS;
729e4ab9
A
5093 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5094 REGEX_CHECK_STATUS;
5095 REGEX_ASSERT(result == &bufferText);
5096 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5097
5098 /* No match. Text should copy to output with no changes. */
5099 uregex_setText(re, text2, -1, &status);
5100 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5101 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5102 REGEX_CHECK_STATUS;
5103 REGEX_ASSERT(result == &bufferText);
5104 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
57a6839d 5105
729e4ab9
A
5106 /* Unicode escapes */
5107 uregex_setText(re, text1, -1, &status);
b331163b 5108 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
729e4ab9
A
5109 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5110 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5111 REGEX_CHECK_STATUS;
5112 REGEX_ASSERT(result == &bufferText);
5113 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5114
5115 uregex_close(re);
5116 utext_close(&replText);
5117 }
5118
5119
5120 /*
5121 * replaceAll()
5122 */
5123 {
5124 UChar text1[80];
5125 UChar text2[80];
5126 UText replText = UTEXT_INITIALIZER;
5127 UText *result;
5128
5129 status = U_ZERO_ERROR;
5130 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5131 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5132 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5133
5134 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5135 REGEX_CHECK_STATUS;
5136
5137 /* Normal case, with match */
5138 uregex_setText(re, text1, -1, &status);
5139 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5140 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5141 REGEX_CHECK_STATUS;
5142 REGEX_ASSERT(result == &bufferText);
5143 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5144
5145 /* No match. Text should copy to output with no changes. */
5146 uregex_setText(re, text2, -1, &status);
5147 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5148 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5149 REGEX_CHECK_STATUS;
5150 REGEX_ASSERT(result == &bufferText);
5151 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5152
5153 uregex_close(re);
5154 utext_close(&replText);
5155 }
5156
5157
5158 /*
5159 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5160 * so we don't need to test it here.
5161 */
57a6839d 5162
729e4ab9
A
5163 utext_close(&bufferText);
5164 utext_close(&patternText);
5165}
5166
b331163b
A
5167
5168//--------------------------------------------------------------
5169//
5170// NamedCapture Check basic named capture group functionality
5171//
5172//--------------------------------------------------------------
5173void RegexTest::NamedCapture() {
5174 UErrorCode status = U_ZERO_ERROR;
5175 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5176 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5177 REGEX_CHECK_STATUS;
5178 int32_t group = pat->groupNumberFromName("five", -1, status);
5179 REGEX_CHECK_STATUS;
5180 REGEX_ASSERT(5 == group);
5181 group = pat->groupNumberFromName("three", -1, status);
5182 REGEX_CHECK_STATUS;
5183 REGEX_ASSERT(3 == group);
5184
5185 status = U_ZERO_ERROR;
5186 group = pat->groupNumberFromName(UnicodeString("six"), status);
5187 REGEX_CHECK_STATUS;
5188 REGEX_ASSERT(6 == group);
5189
5190 status = U_ZERO_ERROR;
5191 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5192 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5193
5194 status = U_ZERO_ERROR;
5195
5196 // After copying a pattern, named capture should still work in the copy.
5197 RegexPattern *copiedPat = new RegexPattern(*pat);
5198 REGEX_ASSERT(*copiedPat == *pat);
5199 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5200
5201 group = copiedPat->groupNumberFromName("five", -1, status);
5202 REGEX_CHECK_STATUS;
5203 REGEX_ASSERT(5 == group);
5204 group = copiedPat->groupNumberFromName("three", -1, status);
5205 REGEX_CHECK_STATUS;
5206 REGEX_ASSERT(3 == group);
5207 delete copiedPat;
5208
5209 // ReplaceAll with named capture group.
5210 status = U_ZERO_ERROR;
5211 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5212 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5213 REGEX_CHECK_STATUS;
5214 // m.pattern().dumpPattern();
5215 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5216 REGEX_CHECK_STATUS;
5217 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5218 delete m;
5219
5220 // ReplaceAll, allowed capture group numbers.
5221 text = UnicodeString("abcmxyz");
5222 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5223 REGEX_CHECK_STATUS;
5224
5225 status = U_ZERO_ERROR;
5226 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5227 REGEX_CHECK_STATUS;
5228 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5229
5230 status = U_ZERO_ERROR;
5231 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5232 REGEX_CHECK_STATUS;
5233 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5234
5235 status = U_ZERO_ERROR;
5236 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5237 REGEX_CHECK_STATUS;
5238 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5239
5240 status = U_ZERO_ERROR;
5241 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5242 REGEX_CHECK_STATUS;
5243 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5244
5245 status = U_ZERO_ERROR;
5246 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5247 REGEX_CHECK_STATUS;
5248 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5249
5250 status = U_ZERO_ERROR;
5251 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5252 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5253
5254 status = U_ZERO_ERROR;
5255 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5256 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5257 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5258
5259 status = U_ZERO_ERROR;
5260 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5261 REGEX_CHECK_STATUS; // that push group num out of range.
5262 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5263
5264 status = U_ZERO_ERROR;
5265 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5266 REGEX_CHECK_STATUS;
5267 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5268
5269 status = U_ZERO_ERROR;
5270 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5271 REGEX_CHECK_STATUS;
5272 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5273
5274 status = U_ZERO_ERROR;
5275 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5276 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5277
5278 status = U_ZERO_ERROR;
5279 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5280 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5281
5282 status = U_ZERO_ERROR;
5283 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5284 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5285
5286 status = U_ZERO_ERROR;
5287 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5288 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5289
5290 delete m;
5291
5292 // Repeat the above replaceAll() tests using the plain C API, which
5293 // has a separate implementation internally.
5294 // TODO: factor out the test data.
5295
5296 status = U_ZERO_ERROR;
5297 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5298 REGEX_CHECK_STATUS;
5299 text = UnicodeString("abcmxyz");
5300 uregex_setText(re, text.getBuffer(), text.length(), &status);
5301 REGEX_CHECK_STATUS;
5302
5303 UChar resultBuf[100];
5304 int32_t resultLength;
5305 UnicodeString repl;
5306
5307 status = U_ZERO_ERROR;
5308 repl = UnicodeString("<$0>");
5309 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5310 REGEX_CHECK_STATUS;
5311 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5312
5313 status = U_ZERO_ERROR;
5314 repl = UnicodeString("<$1>");
5315 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5316 REGEX_CHECK_STATUS;
5317 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5318
5319 status = U_ZERO_ERROR;
5320 repl = UnicodeString("<${one}>");
5321 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5322 REGEX_CHECK_STATUS;
5323 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5324
5325 status = U_ZERO_ERROR;
5326 repl = UnicodeString("<$2>");
5327 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5328 REGEX_CHECK_STATUS;
5329 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5330
5331 status = U_ZERO_ERROR;
5332 repl = UnicodeString("<$3>");
5333 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5334 REGEX_CHECK_STATUS;
5335 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5336
5337 status = U_ZERO_ERROR;
5338 repl = UnicodeString("<$4>");
5339 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5340 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5341
5342 status = U_ZERO_ERROR;
5343 repl = UnicodeString("<$04>");
5344 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5345 REGEX_CHECK_STATUS;
5346 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5347
5348 status = U_ZERO_ERROR;
5349 repl = UnicodeString("<$000016>");
5350 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5351 REGEX_CHECK_STATUS;
5352 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5353
5354 status = U_ZERO_ERROR;
5355 repl = UnicodeString("<$3$2$1${one}>");
5356 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5357 REGEX_CHECK_STATUS;
5358 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5359
5360 status = U_ZERO_ERROR;
5361 repl = UnicodeString("$3$2$1${one}");
5362 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5363 REGEX_CHECK_STATUS;
5364 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5365
5366 status = U_ZERO_ERROR;
5367 repl = UnicodeString("<${noSuchName}>");
5368 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5369 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5370
5371 status = U_ZERO_ERROR;
5372 repl = UnicodeString("<${invalid-name}>");
5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5375
5376 status = U_ZERO_ERROR;
5377 repl = UnicodeString("<${one");
5378 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5379 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5380
5381 status = U_ZERO_ERROR;
5382 repl = UnicodeString("$not a capture group");
5383 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5384 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5385
5386 uregex_close(re);
5387}
5388
5389//--------------------------------------------------------------
5390//
5391// NamedCaptureLimits Patterns with huge numbers of named capture groups.
5392// The point is not so much what the exact limit is,
5393// but that a largish number doesn't hit bad non-linear performance,
5394// and that exceeding the limit fails cleanly.
5395//
5396//--------------------------------------------------------------
5397void RegexTest::NamedCaptureLimits() {
5398 if (quick) {
5399 logln("Skipping test. Runs in exhuastive mode only.");
5400 return;
5401 }
5402 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5403 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5404 char nnbuf[100];
5405 UnicodeString pattern;
5406 int32_t nn;
5407
5408 for (nn=1; nn<goodLimit; nn++) {
5409 sprintf(nnbuf, "(?<nn%d>)", nn);
5410 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5411 }
5412 UErrorCode status = U_ZERO_ERROR;
5413 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5414 REGEX_CHECK_STATUS;
5415 for (nn=1; nn<goodLimit; nn++) {
5416 sprintf(nnbuf, "nn%d", nn);
5417 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5418 REGEX_ASSERT(nn == groupNum);
5419 if (nn != groupNum) {
5420 break;
5421 }
5422 }
5423 delete pat;
5424
5425 pattern.remove();
5426 for (nn=1; nn<failLimit; nn++) {
5427 sprintf(nnbuf, "(?<nn%d>)", nn);
5428 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5429 }
5430 status = U_ZERO_ERROR;
5431 pat = RegexPattern::compile(pattern, 0, status);
5432 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5433 delete pat;
5434}
5435
5436
729e4ab9
A
5437//--------------------------------------------------------------
5438//
5439// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5440//
5441//---------------------------------------------------------------
5442void RegexTest::Bug7651() {
5443 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5444 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5445 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5446 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5447 UnicodeString s("#ff @abcd This is test");
5448 RegexPattern *REPattern = NULL;
5449 RegexMatcher *REMatcher = NULL;
5450 UErrorCode status = U_ZERO_ERROR;
5451 UParseError pe;
5452
5453 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5454 REGEX_CHECK_STATUS;
5455 REMatcher = REPattern->matcher(s, status);
5456 REGEX_CHECK_STATUS;
5457 REGEX_ASSERT(REMatcher->find());
5458 REGEX_ASSERT(REMatcher->start(status) == 0);
5459 delete REPattern;
5460 delete REMatcher;
5461 status = U_ZERO_ERROR;
5462
5463 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5464 REGEX_CHECK_STATUS;
5465 REMatcher = REPattern->matcher(s, status);
5466 REGEX_CHECK_STATUS;
5467 REGEX_ASSERT(REMatcher->find());
5468 REGEX_ASSERT(REMatcher->start(status) == 0);
5469 delete REPattern;
5470 delete REMatcher;
5471 status = U_ZERO_ERROR;
5472 }
5473
5474void RegexTest::Bug7740() {
5475 UErrorCode status = U_ZERO_ERROR;
5476 UnicodeString pattern = "(a)";
5477 UnicodeString text = "abcdef";
5478 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5479 REGEX_CHECK_STATUS;
5480 REGEX_ASSERT(m->lookingAt(status));
5481 REGEX_CHECK_STATUS;
5482 status = U_ILLEGAL_ARGUMENT_ERROR;
5483 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5484 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5485 REGEX_ASSERT(s == "");
5486 delete m;
5487}
5488
4388f060
A
5489// Bug 8479: was crashing whith a Bogus UnicodeString as input.
5490
5491void RegexTest::Bug8479() {
5492 UErrorCode status = U_ZERO_ERROR;
729e4ab9 5493
4388f060
A
5494 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5495 REGEX_CHECK_STATUS;
5496 if (U_SUCCESS(status))
5497 {
5498 UnicodeString str;
5499 str.setToBogus();
5500 pMatcher->reset(str);
5501 status = U_ZERO_ERROR;
5502 pMatcher->matches(status);
5503 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5504 delete pMatcher;
5505 }
5506}
57a6839d 5507
729e4ab9 5508
4388f060
A
5509// Bug 7029
5510void RegexTest::Bug7029() {
5511 UErrorCode status = U_ZERO_ERROR;
5512
5513 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5514 UnicodeString text = "abc.def";
5515 UnicodeString splits[10];
5516 REGEX_CHECK_STATUS;
5517 int32_t numFields = pMatcher->split(text, splits, 10, status);
5518 REGEX_CHECK_STATUS;
5519 REGEX_ASSERT(numFields == 8);
5520 delete pMatcher;
5521}
5522
5523// Bug 9283
5524// This test is checking for the existance of any supplemental characters that case-fold
57a6839d 5525// to a bmp character.
4388f060 5526//
57a6839d
A
5527// At the time of this writing there are none. If any should appear in a subsequent release
5528// of Unicode, the code in regular expressions compilation that determines the longest
5529// posssible match for a literal string will need to be enhanced.
4388f060
A
5530//
5531// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5532// for details on what to do in case of a failure of this test.
5533//
5534void RegexTest::Bug9283() {
57a6839d 5535#if !UCONFIG_NO_NORMALIZATION
4388f060
A
5536 UErrorCode status = U_ZERO_ERROR;
5537 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5538 REGEX_CHECK_STATUS;
5539 int32_t index;
5540 UChar32 c;
5541 for (index=0; ; index++) {
5542 c = supplementalsWithCaseFolding.charAt(index);
5543 if (c == -1) {
5544 break;
5545 }
5546 UnicodeString cf = UnicodeString(c).foldCase();
5547 REGEX_ASSERT(cf.length() >= 2);
5548 }
57a6839d 5549#endif /* #if !UCONFIG_NO_NORMALIZATION */
4388f060
A
5550}
5551
5552
5553void RegexTest::CheckInvBufSize() {
5554 if(inv_next>=INV_BUFSIZ) {
5555 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5556 __FILE__, INV_BUFSIZ, inv_next);
5557 } else {
5558 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5559 }
5560}
5561
57a6839d
A
5562
5563void RegexTest::Bug10459() {
5564 UErrorCode status = U_ZERO_ERROR;
5565 UnicodeString patternString("(txt)");
5566 UnicodeString txtString("txt");
5567
5568 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5569 REGEX_CHECK_STATUS;
5570 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5571 REGEX_CHECK_STATUS;
5572
5573 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5574 REGEX_CHECK_STATUS;
5575
5576 uregex_setUText(icu_re, utext_txt, &status);
5577 REGEX_CHECK_STATUS;
5578
5579 // The bug was that calling uregex_group() before doing a matching operation
5580 // was causing a segfault. Only for Regular Expressions created from UText.
5581 // It should set an U_REGEX_INVALID_STATE.
5582
5583 UChar buf[100];
b331163b 5584 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
57a6839d
A
5585 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5586 REGEX_ASSERT(len == 0);
5587
5588 uregex_close(icu_re);
5589 utext_close(utext_pat);
5590 utext_close(utext_txt);
5591}
5592
b331163b
A
5593void RegexTest::TestCaseInsensitiveStarters() {
5594 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5595 // become stale because of new Unicode characters.
5596 // If it is stale, rerun the generation tool
5597 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5598 // and replace the embedded data in i18n/regexcmp.cpp
5599
5600 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5601 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5602 continue;
5603 }
5604 UnicodeSet s(cp, cp);
5605 s.closeOver(USET_CASE_INSENSITIVE);
5606 UnicodeSetIterator setIter(s);
5607 while (setIter.next()) {
5608 if (!setIter.isString()) {
5609 continue;
5610 }
5611 const UnicodeString &str = setIter.getString();
5612 UChar32 firstChar = str.char32At(0);
5613 UnicodeSet starters;
5614 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5615 if (!starters.contains(cp)) {
5616 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5617 return;
5618 }
5619 }
5620 }
5621}
5622
b75a7d8f 5623
b331163b
A
5624void RegexTest::TestBug11049() {
5625 // Original bug report: pattern with match start consisting of one of several individual characters,
5626 // and the text being matched ending with a supplementary character. find() would read past the
5627 // end of the input text when searching for potential match starting points.
5628
5629 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5630 // detect the bad read.
5631
5632 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5633 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5634
5635 // Test again with a pattern starting with a single character,
5636 // which takes a different code path than starting with an OR expression,
5637 // but with similar logic.
5638 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5639 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5640}
5641
5642// Run a single test case from TestBug11049(). Internal function.
5643void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5644 UErrorCode status = U_ZERO_ERROR;
5645 UnicodeString patternString = UnicodeString(pattern).unescape();
5646 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5647
5648 UnicodeString dataString = UnicodeString(data).unescape();
5649 UChar *exactBuffer = new UChar[dataString.length()];
5650 dataString.extract(exactBuffer, dataString.length(), status);
5651 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5652
5653 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5654 REGEX_CHECK_STATUS;
5655 matcher->reset(ut);
5656 UBool result = matcher->find();
5657 if (result != expectMatch) {
5658 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5659 __FILE__, lineNumber, expectMatch, result, pattern, data);
5660 }
5661
5662 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5663 // off-by-one on find() with match at the last code point.
5664 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5665 // because string.unescape() will only shrink it.
5666 char * utf8Buffer = new char[uprv_strlen(data)+1];
5667 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5668 REGEX_CHECK_STATUS;
5669 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5670 REGEX_CHECK_STATUS;
5671 matcher->reset(ut);
5672 result = matcher->find();
5673 if (result != expectMatch) {
5674 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5675 __FILE__, lineNumber, expectMatch, result, pattern, data);
5676 }
5677 delete [] utf8Buffer;
5678
5679 utext_close(ut);
5680 delete [] exactBuffer;
5681}
5682
5683
5684void RegexTest::TestBug11371() {
5685 if (quick) {
5686 logln("Skipping test. Runs in exhuastive mode only.");
5687 return;
5688 }
5689 UErrorCode status = U_ZERO_ERROR;
5690 UnicodeString patternString;
5691
5692 for (int i=0; i<8000000; i++) {
5693 patternString.append(UnicodeString("()"));
5694 }
5695 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5696 if (status != U_REGEX_PATTERN_TOO_BIG) {
5697 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5698 __FILE__, __LINE__, u_errorName(status));
5699 }
5700
5701 status = U_ZERO_ERROR;
5702 patternString = "(";
5703 for (int i=0; i<20000000; i++) {
5704 patternString.append(UnicodeString("A++"));
5705 }
5706 patternString.append(UnicodeString("){0}B++"));
5707 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5708 if (status != U_REGEX_PATTERN_TOO_BIG) {
5709 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5710 __FILE__, __LINE__, u_errorName(status));
5711 }
5712
5713 // Pattern with too much string data, such that string indexes overflow operand data field size
5714 // in compiled instruction.
5715 status = U_ZERO_ERROR;
5716 patternString = "";
5717 while (patternString.length() < 0x00ffffff) {
5718 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5719 }
5720 patternString.append(UnicodeString("X? trailing string"));
5721 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5722 if (status != U_REGEX_PATTERN_TOO_BIG) {
5723 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5724 __FILE__, __LINE__, u_errorName(status));
5725 }
5726}
5727
5728void RegexTest::TestBug11480() {
5729 // C API, get capture group of a group that does not participate in the match.
5730 // (Returns a zero length string, with nul termination,
2ca993e8 5731 // indistinguishable from a group with a zero length match.)
b331163b
A
5732
5733 UErrorCode status = U_ZERO_ERROR;
5734 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5735 REGEX_CHECK_STATUS;
5736 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5737 uregex_setText(re, text.getBuffer(), text.length(), &status);
5738 REGEX_CHECK_STATUS;
5739 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5740 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5741 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5742 REGEX_ASSERT(length == 0);
5743 REGEX_ASSERT(buf[0] == 13);
5744 REGEX_ASSERT(buf[1] == 0);
5745 REGEX_ASSERT(buf[2] == 13);
5746 uregex_close(re);
2ca993e8
A
5747
5748 // UText C++ API, length of match is 0 for non-participating matches.
5749 UText ut = UTEXT_INITIALIZER;
5750 utext_openUnicodeString(&ut, &text, &status);
5751 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5752 REGEX_CHECK_STATUS;
5753 matcher.reset(&ut);
5754 REGEX_ASSERT(matcher.lookingAt(0, status));
5755
5756 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5757 int64_t groupLen = -666;
5758 UText group = UTEXT_INITIALIZER;
5759 matcher.group(1, &group, groupLen, status);
5760 REGEX_CHECK_STATUS;
5761 REGEX_ASSERT(groupLen == 1);
5762 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5763
5764 // Capture group 2, the (B), does not participate in the match.
5765 matcher.group(2, &group, groupLen, status);
5766 REGEX_CHECK_STATUS;
5767 REGEX_ASSERT(groupLen == 0);
5768 REGEX_ASSERT(matcher.start(2, status) == -1);
5769 REGEX_CHECK_STATUS;
b331163b
A
5770}
5771
f3c0d7a5
A
5772void RegexTest::TestBug12884() {
5773 // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5774 UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5775 UnicodeString text(u"hello");
5776 UErrorCode status = U_ZERO_ERROR;
5777 RegexMatcher m(pattern, text, 0, status);
5778 REGEX_CHECK_STATUS;
5779 m.setTimeLimit(5, status);
5780 m.find(status);
5781 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5782
5783 // Non-greedy loops. They take a different code path during matching.
5784 UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5785 status = U_ZERO_ERROR;
5786 RegexMatcher ngM(ngPattern, text, 0, status);
5787 REGEX_CHECK_STATUS;
5788 ngM.setTimeLimit(5, status);
5789 ngM.find(status);
5790 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5791
5792 // UText, wrapping non-UTF-16 text, also takes a different execution path.
5793 const char *text8 = u8"¿Qué es Unicode? Unicode proporciona un número único para cada"
5794 "carácter, sin importar la plataforma, sin importar el programa,"
5795 "sin importar el idioma.";
5796 status = U_ZERO_ERROR;
5797 LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5798 REGEX_CHECK_STATUS;
5799 m.reset(ut.getAlias());
5800 m.find(status);
5801 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5802
5803 status = U_ZERO_ERROR;
5804 ngM.reset(ut.getAlias());
5805 ngM.find(status);
5806 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5807}
b331163b
A
5808
5809#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */