]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/regextst.cpp
ICU-531.48.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
57a6839d 3 * Copyright (c) 2002-2014, International Business Machines Corporation and
b75a7d8f
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8// regextst.cpp
9//
10// ICU Regular Expressions test, part of intltest.
11//
12
4388f060
A
13/*
14 NOTE!!
15
16 PLEASE be careful about ASCII assumptions in this test.
17 This test is one of the worst repeat offenders.
18 If you have questions, contact someone on the ICU PMC
19 who has access to an EBCDIC system.
20
21 */
22
374ca955 23#include "intltest.h"
b75a7d8f
A
24#if !UCONFIG_NO_REGULAR_EXPRESSIONS
25
374ca955 26#include "unicode/regex.h"
b75a7d8f
A
27#include "unicode/uchar.h"
28#include "unicode/ucnv.h"
4388f060 29#include "unicode/uniset.h"
57a6839d 30#include "unicode/uregex.h"
729e4ab9 31#include "unicode/ustring.h"
b75a7d8f
A
32#include "regextst.h"
33#include "uvector.h"
b75a7d8f 34#include "util.h"
374ca955 35#include <stdlib.h>
73c04bcf 36#include <string.h>
374ca955 37#include <stdio.h>
729e4ab9
A
38#include "cstring.h"
39#include "uinvchar.h"
b75a7d8f 40
729e4ab9 41#define SUPPORT_MUTATING_INPUT_STRING 0
b75a7d8f
A
42
43//---------------------------------------------------------------------------
44//
45// Test class boilerplate
46//
47//---------------------------------------------------------------------------
374ca955 48RegexTest::RegexTest()
b75a7d8f 49{
73c04bcf 50}
b75a7d8f
A
51
52
53RegexTest::~RegexTest()
54{
73c04bcf 55}
b75a7d8f
A
56
57
58
59void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
60{
61 if (exec) logln("TestSuite RegexTest: ");
62 switch (index) {
63
64 case 0: name = "Basic";
374ca955 65 if (exec) Basic();
b75a7d8f
A
66 break;
67 case 1: name = "API_Match";
374ca955 68 if (exec) API_Match();
b75a7d8f
A
69 break;
70 case 2: name = "API_Replace";
374ca955 71 if (exec) API_Replace();
b75a7d8f
A
72 break;
73 case 3: name = "API_Pattern";
374ca955 74 if (exec) API_Pattern();
b75a7d8f 75 break;
729e4ab9
A
76 case 4:
77#if !UCONFIG_NO_FILE_IO
78 name = "Extended";
374ca955 79 if (exec) Extended();
729e4ab9
A
80#else
81 name = "skip";
82#endif
b75a7d8f
A
83 break;
84 case 5: name = "Errors";
374ca955 85 if (exec) Errors();
b75a7d8f
A
86 break;
87 case 6: name = "PerlTests";
88 if (exec) PerlTests();
89 break;
46f4442e 90 case 7: name = "Callbacks";
729e4ab9
A
91 if (exec) Callbacks();
92 break;
93 case 8: name = "FindProgressCallbacks";
94 if (exec) FindProgressCallbacks();
95 break;
96 case 9: name = "Bug 6149";
97 if (exec) Bug6149();
98 break;
99 case 10: name = "UTextBasic";
100 if (exec) UTextBasic();
101 break;
102 case 11: name = "API_Match_UTF8";
103 if (exec) API_Match_UTF8();
104 break;
105 case 12: name = "API_Replace_UTF8";
106 if (exec) API_Replace_UTF8();
107 break;
108 case 13: name = "API_Pattern_UTF8";
109 if (exec) API_Pattern_UTF8();
110 break;
111 case 14: name = "PerlTestsUTF8";
112 if (exec) PerlTestsUTF8();
113 break;
114 case 15: name = "PreAllocatedUTextCAPI";
115 if (exec) PreAllocatedUTextCAPI();
46f4442e 116 break;
729e4ab9
A
117 case 16: name = "Bug 7651";
118 if (exec) Bug7651();
119 break;
120 case 17: name = "Bug 7740";
121 if (exec) Bug7740();
122 break;
4388f060
A
123 case 18: name = "Bug 8479";
124 if (exec) Bug8479();
125 break;
126 case 19: name = "Bug 7029";
127 if (exec) Bug7029();
128 break;
129 case 20: name = "CheckInvBufSize";
130 if (exec) CheckInvBufSize();
131 break;
132 case 21: name = "Bug 9283";
133 if (exec) Bug9283();
134 break;
57a6839d
A
135 case 22: name = "Bug10459";
136 if (exec) Bug10459();
137 break;
b75a7d8f 138
374ca955 139 default: name = "";
b75a7d8f
A
140 break; //needed to end loop
141 }
142}
143
144
4388f060 145
729e4ab9
A
146/**
147 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
57a6839d 148 * into ASCII.
729e4ab9
A
149 * @see utext_openUTF8
150 */
151static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
152
b75a7d8f
A
153//---------------------------------------------------------------------------
154//
155// Error Checking / Reporting macros used in all of the tests.
156//
157//---------------------------------------------------------------------------
b75a7d8f 158
729e4ab9
A
159static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
160 int64_t oldIndex = utext_getNativeIndex(text);
161 utext_setNativeIndex(text, 0);
162 char *bufPtr = buf;
163 UChar32 c = utext_next32From(text, 0);
164 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
165 if (0x000020<=c && c<0x00007e) {
166 *bufPtr = c;
167 } else {
168#if 0
169 sprintf(bufPtr,"U+%04X", c);
170 bufPtr+= strlen(bufPtr)-1;
171#else
172 *bufPtr = '%';
173#endif
174 }
175 bufPtr++;
176 c = UTEXT_NEXT32(text);
177 }
178 *bufPtr = 0;
179#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
180 char *ebuf = (char*)malloc(bufLen);
181 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
182 uprv_strncpy(buf, ebuf, bufLen);
183 free((void*)ebuf);
184#endif
185 utext_setNativeIndex(text, oldIndex);
186}
187
4388f060
A
188
189static char ASSERT_BUF[1024];
190
191const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
192 if(message.length()==0) {
193 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
194 } else {
195 UnicodeString buf;
196 IntlTest::prettify(message,buf);
197 if(buf.length()==0) {
198 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
199 } else {
200 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
201 if(ASSERT_BUF[0]==0) {
202 ASSERT_BUF[0]=0;
203 for(int32_t i=0;i<buf.length();i++) {
204 UChar ch = buf[i];
205 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
206 }
207 }
208 }
209 }
210 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
211 return ASSERT_BUF;
212}
213
// Number of elements in a fixed-size array, as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// Logs a printable rendering of a UText together with file, line and the
// spelling of the argument expression.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// If the in-scope variable `status` holds a failure code, reports it and
// returns from the enclosing (void) function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error when expr evaluates to FALSE; execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    }; \
}

// Evaluates expr with a fresh local `status` (shadowing any outer one) and
// reports an error unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    }; \
}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and does
// not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but also reports the caller-supplied line and returns
// from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// Reports an error when ustr does not compare equal to inv.
#define REGEX_ASSERT_UNISTR(ustr,inv) { \
    if (!(ustr==inv)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv); \
    }; \
}
234
235
236static UBool testUTextEqual(UText *uta, UText *utb) {
237 UChar32 ca = 0;
238 UChar32 cb = 0;
239 utext_setNativeIndex(uta, 0);
240 utext_setNativeIndex(utb, 0);
241 do {
242 ca = utext_next32(uta);
243 cb = utext_next32(utb);
244 if (ca != cb) {
245 break;
246 }
247 } while (ca != U_SENTINEL);
248 return ca == cb;
249}
250
251
729e4ab9
A
252/**
253 * @param expected expected text in UTF-8 (not platform) codepage
254 */
255void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
256 UErrorCode status = U_ZERO_ERROR;
257 UText expectedText = UTEXT_INITIALIZER;
258 utext_openUTF8(&expectedText, expected, -1, &status);
259 if(U_FAILURE(status)) {
260 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
261 return;
262 }
263 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
264 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
265 return;
266 }
267 utext_setNativeIndex(actual, 0);
4388f060 268 if (!testUTextEqual(&expectedText, actual)) {
729e4ab9
A
269 char buf[201 /*21*/];
270 char expectedBuf[201];
271 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
272 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
273 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
274 }
275 utext_close(&expectedText);
276}
277/**
278 * @param expected invariant (platform local text) input
279 */
280
281void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
282 UErrorCode status = U_ZERO_ERROR;
283 UText expectedText = UTEXT_INITIALIZER;
284 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
285 if(U_FAILURE(status)) {
286 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
287 return;
288 }
289 utext_setNativeIndex(actual, 0);
4388f060 290 if (!testUTextEqual(&expectedText, actual)) {
729e4ab9
A
291 char buf[201 /*21*/];
292 char expectedBuf[201];
293 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
294 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
295 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
296 }
297 utext_close(&expectedText);
298}
299
300/**
57a6839d 301 * Assumes utf-8 input
729e4ab9
A
302 */
303#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
304/**
57a6839d 305 * Assumes Invariant input
729e4ab9
A
306 */
307#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
b75a7d8f 308
4388f060
A
309/**
310 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
311 * passed into utext_openUTF8. An error will be given if
312 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
57a6839d 313 */
4388f060
A
314
315#define INV_BUFSIZ 2048 /* increase this if too small */
316
51004dcb 317static int64_t inv_next=0;
4388f060
A
318
319#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
320static char inv_buf[INV_BUFSIZ];
321#endif
322
323static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
324 if(length==-1) length=strlen(inv);
325#if U_CHARSET_FAMILY==U_ASCII_FAMILY
326 inv_next+=length;
327 return utext_openUTF8(ut, inv, length, status);
328#else
329 if(inv_next+length+1>INV_BUFSIZ) {
330 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
331 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
332 *status = U_MEMORY_ALLOCATION_ERROR;
333 return NULL;
334 }
335
336 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
337 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
338 inv_next+=length;
339
340#if 0
341 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
342#endif
343
344 return utext_openUTF8(ut, (const char*)buf, length, status);
345#endif
346}
347
b75a7d8f
A
348
349//---------------------------------------------------------------------------
350//
351// REGEX_TESTLM Macro + invocation function to simplify writing quick tests
352// for the LookingAt() and Match() functions.
353//
354// usage:
355// REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
356//
357// The expected results are UBool - TRUE or FALSE.
358// The input text is unescaped. The pattern is not.
374ca955 359//
b75a7d8f
A
360//
361//---------------------------------------------------------------------------
362
729e4ab9 363#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
b75a7d8f 364
46f4442e
A
365UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
366 const UnicodeString pattern(pat, -1, US_INV);
367 const UnicodeString inputText(text, -1, US_INV);
b75a7d8f
A
368 UErrorCode status = U_ZERO_ERROR;
369 UParseError pe;
370 RegexPattern *REPattern = NULL;
371 RegexMatcher *REMatcher = NULL;
372 UBool retVal = TRUE;
373
46f4442e 374 UnicodeString patString(pat, -1, US_INV);
b75a7d8f
A
375 REPattern = RegexPattern::compile(patString, 0, pe, status);
376 if (U_FAILURE(status)) {
729e4ab9 377 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
b75a7d8f
A
378 line, u_errorName(status));
379 return FALSE;
380 }
57a6839d 381 if (line==376) { REPattern->dumpPattern();}
b75a7d8f
A
382
383 UnicodeString inputString(inputText);
384 UnicodeString unEscapedInput = inputString.unescape();
385 REMatcher = REPattern->matcher(unEscapedInput, status);
386 if (U_FAILURE(status)) {
387 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
388 line, u_errorName(status));
389 return FALSE;
390 }
374ca955 391
b75a7d8f
A
392 UBool actualmatch;
393 actualmatch = REMatcher->lookingAt(status);
394 if (U_FAILURE(status)) {
395 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
396 line, u_errorName(status));
397 retVal = FALSE;
398 }
399 if (actualmatch != looking) {
400 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
401 retVal = FALSE;
402 }
403
404 status = U_ZERO_ERROR;
405 actualmatch = REMatcher->matches(status);
406 if (U_FAILURE(status)) {
407 errln("RegexTest failure in matches() at line %d. Status = %s\n",
408 line, u_errorName(status));
409 retVal = FALSE;
410 }
411 if (actualmatch != match) {
412 errln("RegexTest: wrong return from matches() at line %d.\n", line);
413 retVal = FALSE;
414 }
415
416 if (retVal == FALSE) {
57a6839d 417 REPattern->dumpPattern();
b75a7d8f
A
418 }
419
420 delete REPattern;
421 delete REMatcher;
422 return retVal;
423}
374ca955 424
b75a7d8f 425
729e4ab9
A
426UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
427 UText pattern = UTEXT_INITIALIZER;
428 int32_t inputUTF8Length;
429 char *textChars = NULL;
430 UText inputText = UTEXT_INITIALIZER;
431 UErrorCode status = U_ZERO_ERROR;
432 UParseError pe;
433 RegexPattern *REPattern = NULL;
434 RegexMatcher *REMatcher = NULL;
435 UBool retVal = TRUE;
436
437 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
438 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
439 if (U_FAILURE(status)) {
440 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
441 line, u_errorName(status));
442 return FALSE;
443 }
57a6839d 444
729e4ab9
A
445 UnicodeString inputString(text, -1, US_INV);
446 UnicodeString unEscapedInput = inputString.unescape();
447 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
448 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
57a6839d 449
729e4ab9
A
450 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
451 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
452 // UTF-8 does not allow unpaired surrogates, so this could actually happen
453 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
454 return TRUE; // not a failure of the Regex engine
455 }
456 status = U_ZERO_ERROR; // buffer overflow
457 textChars = new char[inputUTF8Length+1];
458 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
459 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
57a6839d 460
4388f060 461 REMatcher = &REPattern->matcher(status)->reset(&inputText);
729e4ab9
A
462 if (U_FAILURE(status)) {
463 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
464 line, u_errorName(status));
465 return FALSE;
466 }
467
468 UBool actualmatch;
469 actualmatch = REMatcher->lookingAt(status);
470 if (U_FAILURE(status)) {
471 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
472 line, u_errorName(status));
473 retVal = FALSE;
474 }
475 if (actualmatch != looking) {
476 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
477 retVal = FALSE;
478 }
479
480 status = U_ZERO_ERROR;
481 actualmatch = REMatcher->matches(status);
482 if (U_FAILURE(status)) {
483 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
484 line, u_errorName(status));
485 retVal = FALSE;
486 }
487 if (actualmatch != match) {
488 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
489 retVal = FALSE;
490 }
491
492 if (retVal == FALSE) {
57a6839d 493 REPattern->dumpPattern();
729e4ab9
A
494 }
495
496 delete REPattern;
497 delete REMatcher;
498 utext_close(&inputText);
499 utext_close(&pattern);
500 delete[] textChars;
501 return retVal;
502}
b75a7d8f
A
503
504
b75a7d8f
A
505
506//---------------------------------------------------------------------------
507//
508// REGEX_ERR Macro + invocation function to simplify writing tests
509// regex tests for incorrect patterns
510//
511// usage:
512// REGEX_ERR("pattern", expected error line, column, expected status);
513//
514//---------------------------------------------------------------------------
515#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
516
517void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
46f4442e 518 UErrorCode expectedStatus, int32_t line) {
b75a7d8f
A
519 UnicodeString pattern(pat);
520
521 UErrorCode status = U_ZERO_ERROR;
522 UParseError pe;
523 RegexPattern *callerPattern = NULL;
524
525 //
526 // Compile the caller's pattern
527 //
528 UnicodeString patString(pat);
529 callerPattern = RegexPattern::compile(patString, 0, pe, status);
530 if (status != expectedStatus) {
729e4ab9 531 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
b75a7d8f
A
532 } else {
533 if (status != U_ZERO_ERROR) {
534 if (pe.line != errLine || pe.offset != errCol) {
535 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
536 line, errLine, errCol, pe.line, pe.offset);
537 }
538 }
539 }
540
541 delete callerPattern;
729e4ab9
A
542
543 //
544 // Compile again, using a UTF-8-based UText
545 //
546 UText patternText = UTEXT_INITIALIZER;
547 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
548 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
549 if (status != expectedStatus) {
550 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
551 } else {
552 if (status != U_ZERO_ERROR) {
553 if (pe.line != errLine || pe.offset != errCol) {
554 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
555 line, errLine, errCol, pe.line, pe.offset);
556 }
557 }
558 }
57a6839d 559
729e4ab9
A
560 delete callerPattern;
561 utext_close(&patternText);
b75a7d8f
A
562}
563
564
565
566//---------------------------------------------------------------------------
567//
568// Basic Check for basic functionality of regex pattern matching.
569// Avoid the use of REGEX_FIND test macro, which has
570// substantial dependencies on basic Regex functionality.
571//
572//---------------------------------------------------------------------------
// Basic pattern-matching checks, written as a flat list of REGEX_TESTLM calls.
// Each call has the form
//     REGEX_TESTLM(pattern, input, expected lookingAt(), expected matches())
// and runs the check through both doRegexLMTest() and doRegexLMTestUTF8().
// The input text is unescaped before matching; the pattern is not.
// Failures are reported with the line number of the REGEX_TESTLM call.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "", TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    // Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    // Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    // Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    // UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);

    //
    // OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    // +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    // ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    // Escape sequences that become single literal chars, handled internally
    // by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
b75a7d8f
A
719
720
729e4ab9
A
721//---------------------------------------------------------------------------
722//
723// UTextBasic Check for quirks that are specific to the UText
724// implementation.
725//
726//---------------------------------------------------------------------------
// Builds a RegexMatcher from a UTF-8 UText pattern, then checks that
// inputText() still reads "abc" after reset() with a fresh UText and again
// after reset() with the matcher's own inputText().
void RegexTest::UTextBasic() {
    // "abc" spelled as explicit byte values rather than a character literal.
    const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
    UErrorCode status = U_ZERO_ERROR;
    UText pattern = UTEXT_INITIALIZER;
    utext_openUTF8(&pattern, str_abc, -1, &status);
    RegexMatcher matcher(&pattern, 0, status);
    // NOTE(review): REGEX_CHECK_STATUS returns early on failure, which skips
    // the utext_close() calls at the end of this function.
    REGEX_CHECK_STATUS;

    UText input = UTEXT_INITIALIZER;
    utext_openUTF8(&input, str_abc, -1, &status);
    REGEX_CHECK_STATUS;
    matcher.reset(&input);
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    // Reset the matcher using the UText it already holds as input.
    matcher.reset(matcher.inputText());
    REGEX_CHECK_STATUS;
    REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());

    utext_close(&pattern);
    utext_close(&input);
}
b75a7d8f
A
749
750
751//---------------------------------------------------------------------------
752//
374ca955 753// API_Match Test that the API for class RegexMatcher
b75a7d8f
A
754// is present and nominally working, but excluding functions
755// implementing replace operations.
756//
757//---------------------------------------------------------------------------
758void RegexTest::API_Match() {
759 UParseError pe;
760 UErrorCode status=U_ZERO_ERROR;
761 int32_t flags = 0;
762
763 //
764 // Debug - slide failing test cases early
765 //
766#if 0
767 {
768 }
769 return;
770#endif
771
772 //
773 // Simple pattern compilation
774 //
775 {
776 UnicodeString re("abc");
777 RegexPattern *pat2;
778 pat2 = RegexPattern::compile(re, flags, pe, status);
779 REGEX_CHECK_STATUS;
374ca955 780
b75a7d8f
A
781 UnicodeString inStr1 = "abcdef this is a test";
782 UnicodeString instr2 = "not abc";
783 UnicodeString empty = "";
374ca955
A
784
785
b75a7d8f
A
786 //
787 // Matcher creation and reset.
788 //
789 RegexMatcher *m1 = pat2->matcher(inStr1, status);
790 REGEX_CHECK_STATUS;
374ca955 791 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f
A
792 REGEX_ASSERT(m1->input() == inStr1);
793 m1->reset(instr2);
794 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
795 REGEX_ASSERT(m1->input() == instr2);
796 m1->reset(inStr1);
797 REGEX_ASSERT(m1->input() == inStr1);
798 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
799 m1->reset(empty);
800 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
801 REGEX_ASSERT(m1->input() == empty);
802 REGEX_ASSERT(&m1->pattern() == pat2);
374ca955
A
803
804 //
805 // reset(pos, status)
806 //
807 m1->reset(inStr1);
808 m1->reset(4, status);
809 REGEX_CHECK_STATUS;
810 REGEX_ASSERT(m1->input() == inStr1);
811 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
812
813 m1->reset(-1, status);
814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
815 status = U_ZERO_ERROR;
816
817 m1->reset(0, status);
818 REGEX_CHECK_STATUS;
819 status = U_ZERO_ERROR;
820
821 int32_t len = m1->input().length();
822 m1->reset(len-1, status);
823 REGEX_CHECK_STATUS;
824 status = U_ZERO_ERROR;
825
826 m1->reset(len, status);
729e4ab9
A
827 REGEX_CHECK_STATUS;
828 status = U_ZERO_ERROR;
829
830 m1->reset(len+1, status);
374ca955
A
831 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
832 status = U_ZERO_ERROR;
833
834 //
835 // match(pos, status)
836 //
837 m1->reset(instr2);
838 REGEX_ASSERT(m1->matches(4, status) == TRUE);
839 m1->reset();
840 REGEX_ASSERT(m1->matches(3, status) == FALSE);
841 m1->reset();
842 REGEX_ASSERT(m1->matches(5, status) == FALSE);
843 REGEX_ASSERT(m1->matches(4, status) == TRUE);
844 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
845 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
846
847 // Match() at end of string should fail, but should not
848 // be an error.
849 status = U_ZERO_ERROR;
850 len = m1->input().length();
851 REGEX_ASSERT(m1->matches(len, status) == FALSE);
852 REGEX_CHECK_STATUS;
853
854 // Match beyond end of string should fail with an error.
855 status = U_ZERO_ERROR;
856 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
857 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
858
859 // Successful match at end of string.
860 {
861 status = U_ZERO_ERROR;
862 RegexMatcher m("A?", 0, status); // will match zero length string.
863 REGEX_CHECK_STATUS;
864 m.reset(inStr1);
865 len = inStr1.length();
866 REGEX_ASSERT(m.matches(len, status) == TRUE);
867 REGEX_CHECK_STATUS;
868 m.reset(empty);
869 REGEX_ASSERT(m.matches(0, status) == TRUE);
870 REGEX_CHECK_STATUS;
871 }
872
873
874 //
875 // lookingAt(pos, status)
876 //
877 status = U_ZERO_ERROR;
878 m1->reset(instr2); // "not abc"
879 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
880 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
881 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
882 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
883 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
884 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885 status = U_ZERO_ERROR;
886 len = m1->input().length();
887 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
888 REGEX_CHECK_STATUS;
889 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
890 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
891
b75a7d8f
A
892 delete m1;
893 delete pat2;
894 }
895
896
897 //
374ca955 898 // Capture Group.
b75a7d8f
A
899 // RegexMatcher::start();
900 // RegexMatcher::end();
901 // RegexMatcher::groupCount();
902 //
903 {
904 int32_t flags=0;
905 UParseError pe;
906 UErrorCode status=U_ZERO_ERROR;
907
908 UnicodeString re("01(23(45)67)(.*)");
909 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
910 REGEX_CHECK_STATUS;
911 UnicodeString data = "0123456789";
374ca955 912
b75a7d8f
A
913 RegexMatcher *matcher = pat->matcher(data, status);
914 REGEX_CHECK_STATUS;
374ca955 915 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
46f4442e
A
916 static const int32_t matchStarts[] = {0, 2, 4, 8};
917 static const int32_t matchEnds[] = {10, 8, 6, 10};
918 int32_t i;
b75a7d8f
A
919 for (i=0; i<4; i++) {
920 int32_t actualStart = matcher->start(i, status);
921 REGEX_CHECK_STATUS;
922 if (actualStart != matchStarts[i]) {
923 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
924 __LINE__, i, matchStarts[i], actualStart);
925 }
926 int32_t actualEnd = matcher->end(i, status);
927 REGEX_CHECK_STATUS;
928 if (actualEnd != matchEnds[i]) {
929 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
930 __LINE__, i, matchEnds[i], actualEnd);
931 }
932 }
933
934 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
935 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
936
937 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
938 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
939 matcher->reset();
940 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
941
942 matcher->lookingAt(status);
943 REGEX_ASSERT(matcher->group(status) == "0123456789");
944 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
945 REGEX_ASSERT(matcher->group(1, status) == "234567" );
946 REGEX_ASSERT(matcher->group(2, status) == "45" );
947 REGEX_ASSERT(matcher->group(3, status) == "89" );
948 REGEX_CHECK_STATUS;
949 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
950 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
951 matcher->reset();
952 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
953
954 delete matcher;
955 delete pat;
956
957 }
958
959 //
960 // find
961 //
962 {
963 int32_t flags=0;
964 UParseError pe;
965 UErrorCode status=U_ZERO_ERROR;
966
967 UnicodeString re("abc");
968 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
969 REGEX_CHECK_STATUS;
970 UnicodeString data = ".abc..abc...abc..";
971 // 012345678901234567
374ca955 972
b75a7d8f
A
973 RegexMatcher *matcher = pat->matcher(data, status);
974 REGEX_CHECK_STATUS;
975 REGEX_ASSERT(matcher->find());
976 REGEX_ASSERT(matcher->start(status) == 1);
977 REGEX_ASSERT(matcher->find());
978 REGEX_ASSERT(matcher->start(status) == 6);
979 REGEX_ASSERT(matcher->find());
980 REGEX_ASSERT(matcher->start(status) == 12);
981 REGEX_ASSERT(matcher->find() == FALSE);
982 REGEX_ASSERT(matcher->find() == FALSE);
983
984 matcher->reset();
985 REGEX_ASSERT(matcher->find());
986 REGEX_ASSERT(matcher->start(status) == 1);
987
988 REGEX_ASSERT(matcher->find(0, status));
989 REGEX_ASSERT(matcher->start(status) == 1);
990 REGEX_ASSERT(matcher->find(1, status));
991 REGEX_ASSERT(matcher->start(status) == 1);
992 REGEX_ASSERT(matcher->find(2, status));
993 REGEX_ASSERT(matcher->start(status) == 6);
994 REGEX_ASSERT(matcher->find(12, status));
995 REGEX_ASSERT(matcher->start(status) == 12);
996 REGEX_ASSERT(matcher->find(13, status) == FALSE);
997 REGEX_ASSERT(matcher->find(16, status) == FALSE);
374ca955 998 REGEX_ASSERT(matcher->find(17, status) == FALSE);
b75a7d8f 999 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
b75a7d8f 1000
374ca955 1001 status = U_ZERO_ERROR;
b75a7d8f 1002 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
1003 status = U_ZERO_ERROR;
1004 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f
A
1005
1006 REGEX_ASSERT(matcher->groupCount() == 0);
1007
1008 delete matcher;
1009 delete pat;
1010 }
1011
1012
1013 //
1014 // find, with \G in pattern (true if at the end of a previous match).
1015 //
1016 {
1017 int32_t flags=0;
1018 UParseError pe;
1019 UErrorCode status=U_ZERO_ERROR;
1020
46f4442e 1021 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
b75a7d8f
A
1022 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1023 REGEX_CHECK_STATUS;
1024 UnicodeString data = ".abcabc.abc..";
1025 // 012345678901234567
374ca955 1026
b75a7d8f
A
1027 RegexMatcher *matcher = pat->matcher(data, status);
1028 REGEX_CHECK_STATUS;
1029 REGEX_ASSERT(matcher->find());
1030 REGEX_ASSERT(matcher->start(status) == 0);
374ca955 1031 REGEX_ASSERT(matcher->start(1, status) == -1);
b75a7d8f
A
1032 REGEX_ASSERT(matcher->start(2, status) == 1);
1033
1034 REGEX_ASSERT(matcher->find());
1035 REGEX_ASSERT(matcher->start(status) == 4);
374ca955 1036 REGEX_ASSERT(matcher->start(1, status) == 4);
b75a7d8f
A
1037 REGEX_ASSERT(matcher->start(2, status) == -1);
1038 REGEX_CHECK_STATUS;
1039
1040 delete matcher;
1041 delete pat;
1042 }
1043
374ca955
A
1044 //
1045 // find with zero length matches, match position should bump ahead
1046 // to prevent loops.
1047 //
1048 {
46f4442e 1049 int32_t i;
374ca955
A
1050 UErrorCode status=U_ZERO_ERROR;
1051 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1052 // using an always-true look-ahead.
1053 REGEX_CHECK_STATUS;
1054 UnicodeString s(" ");
1055 m.reset(s);
1056 for (i=0; ; i++) {
1057 if (m.find() == FALSE) {
1058 break;
1059 }
1060 REGEX_ASSERT(m.start(status) == i);
1061 REGEX_ASSERT(m.end(status) == i);
1062 }
1063 REGEX_ASSERT(i==5);
1064
1065 // Check that the bump goes over surrogate pairs OK
46f4442e 1066 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
374ca955
A
1067 s = s.unescape();
1068 m.reset(s);
1069 for (i=0; ; i+=2) {
1070 if (m.find() == FALSE) {
1071 break;
1072 }
1073 REGEX_ASSERT(m.start(status) == i);
1074 REGEX_ASSERT(m.end(status) == i);
1075 }
1076 REGEX_ASSERT(i==10);
1077 }
1078 {
1079 // find() loop breaking test.
1080 // with pattern of /.?/, should see a series of one char matches, then a single
1081 // match of zero length at the end of the input string.
46f4442e 1082 int32_t i;
374ca955
A
1083 UErrorCode status=U_ZERO_ERROR;
1084 RegexMatcher m(".?", 0, status);
1085 REGEX_CHECK_STATUS;
1086 UnicodeString s(" ");
1087 m.reset(s);
1088 for (i=0; ; i++) {
1089 if (m.find() == FALSE) {
1090 break;
1091 }
1092 REGEX_ASSERT(m.start(status) == i);
1093 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1094 }
1095 REGEX_ASSERT(i==5);
1096 }
1097
1098
b75a7d8f
A
1099 //
1100 // Matchers with no input string behave as if they had an empty input string.
1101 //
1102
1103 {
1104 UErrorCode status = U_ZERO_ERROR;
1105 RegexMatcher m(".?", 0, status);
1106 REGEX_CHECK_STATUS;
1107 REGEX_ASSERT(m.find());
1108 REGEX_ASSERT(m.start(status) == 0);
1109 REGEX_ASSERT(m.input() == "");
1110 }
1111 {
1112 UErrorCode status = U_ZERO_ERROR;
1113 RegexPattern *p = RegexPattern::compile(".", 0, status);
1114 RegexMatcher *m = p->matcher(status);
1115 REGEX_CHECK_STATUS;
374ca955 1116
b75a7d8f
A
1117 REGEX_ASSERT(m->find() == FALSE);
1118 REGEX_ASSERT(m->input() == "");
1119 delete m;
1120 delete p;
1121 }
57a6839d 1122
46f4442e
A
1123 //
1124 // Regions
1125 //
1126 {
1127 UErrorCode status = U_ZERO_ERROR;
1128 UnicodeString testString("This is test data");
1129 RegexMatcher m(".*", testString, 0, status);
1130 REGEX_CHECK_STATUS;
1131 REGEX_ASSERT(m.regionStart() == 0);
1132 REGEX_ASSERT(m.regionEnd() == testString.length());
1133 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1134 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 1135
46f4442e
A
1136 m.region(2,4, status);
1137 REGEX_CHECK_STATUS;
1138 REGEX_ASSERT(m.matches(status));
1139 REGEX_ASSERT(m.start(status)==2);
1140 REGEX_ASSERT(m.end(status)==4);
1141 REGEX_CHECK_STATUS;
57a6839d 1142
46f4442e
A
1143 m.reset();
1144 REGEX_ASSERT(m.regionStart() == 0);
1145 REGEX_ASSERT(m.regionEnd() == testString.length());
57a6839d 1146
46f4442e
A
1147 UnicodeString shorterString("short");
1148 m.reset(shorterString);
1149 REGEX_ASSERT(m.regionStart() == 0);
1150 REGEX_ASSERT(m.regionEnd() == shorterString.length());
57a6839d 1151
46f4442e
A
1152 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1153 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1154 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1155 REGEX_ASSERT(&m == &m.reset());
1156 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
57a6839d 1157
46f4442e
A
1158 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1159 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1160 REGEX_ASSERT(&m == &m.reset());
1161 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 1162
46f4442e
A
1163 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1164 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1165 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1166 REGEX_ASSERT(&m == &m.reset());
1167 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1168
1169 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1170 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1171 REGEX_ASSERT(&m == &m.reset());
1172 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
57a6839d 1173
46f4442e 1174 }
57a6839d 1175
46f4442e
A
1176 //
1177 // hitEnd() and requireEnd()
1178 //
1179 {
1180 UErrorCode status = U_ZERO_ERROR;
1181 UnicodeString testString("aabb");
1182 RegexMatcher m1(".*", testString, 0, status);
1183 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1184 REGEX_ASSERT(m1.hitEnd() == TRUE);
1185 REGEX_ASSERT(m1.requireEnd() == FALSE);
1186 REGEX_CHECK_STATUS;
57a6839d 1187
46f4442e
A
1188 status = U_ZERO_ERROR;
1189 RegexMatcher m2("a*", testString, 0, status);
1190 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1191 REGEX_ASSERT(m2.hitEnd() == FALSE);
1192 REGEX_ASSERT(m2.requireEnd() == FALSE);
1193 REGEX_CHECK_STATUS;
1194
1195 status = U_ZERO_ERROR;
1196 RegexMatcher m3(".*$", testString, 0, status);
1197 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1198 REGEX_ASSERT(m3.hitEnd() == TRUE);
1199 REGEX_ASSERT(m3.requireEnd() == TRUE);
1200 REGEX_CHECK_STATUS;
1201 }
1202
b75a7d8f 1203
374ca955
A
1204 //
1205 // Compilation error on reset with UChar *
1206 // These were a hazard that people were stumbling over with runtime errors.
1207 // Changed them to compiler errors by adding private methods that more closely
1208 // matched the incorrect use of the functions.
1209 //
1210#if 0
1211 {
1212 UErrorCode status = U_ZERO_ERROR;
1213 UChar ucharString[20];
1214 RegexMatcher m(".", 0, status);
1215 m.reset(ucharString); // should not compile.
1216
1217 RegexPattern *p = RegexPattern::compile(".", 0, status);
1218 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1219
1220 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1221 }
1222#endif
1223
46f4442e 1224 //
57a6839d 1225 // Time Outs.
46f4442e
A
1226 // Note: These tests will need to be changed when the regexp engine is
1227 // able to detect and cut short the exponential time behavior on
1228 // this type of match.
1229 //
1230 {
1231 UErrorCode status = U_ZERO_ERROR;
1232 // Enough 'a's in the string to cause the match to time out.
1233 // (Each additional 'a' doubles the time)
1234 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1235 RegexMatcher matcher("(a+)+b", testString, 0, status);
1236 REGEX_CHECK_STATUS;
1237 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1238 matcher.setTimeLimit(100, status);
1239 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1240 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1241 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1242 }
1243 {
1244 UErrorCode status = U_ZERO_ERROR;
1245 // Few enough 'a's to slip in under the time limit.
1246 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1247 RegexMatcher matcher("(a+)+b", testString, 0, status);
1248 REGEX_CHECK_STATUS;
1249 matcher.setTimeLimit(100, status);
1250 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1251 REGEX_CHECK_STATUS;
1252 }
57a6839d 1253
46f4442e
A
1254 //
1255 // Stack Limits
1256 //
1257 {
1258 UErrorCode status = U_ZERO_ERROR;
729e4ab9 1259 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
57a6839d 1260
46f4442e
A
1261 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262 // of the '+', and makes the stack frames larger.
1263 RegexMatcher matcher("(A)+A$", testString, 0, status);
57a6839d 1264
46f4442e
A
1265 // With the default stack, this match should fail to run
1266 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1267 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
57a6839d 1268
46f4442e
A
1269 // With unlimited stack, it should run
1270 status = U_ZERO_ERROR;
1271 matcher.setStackLimit(0, status);
1272 REGEX_CHECK_STATUS;
1273 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1274 REGEX_CHECK_STATUS;
1275 REGEX_ASSERT(matcher.getStackLimit() == 0);
1276
1277 // With a limited stack, the match should fail
1278 status = U_ZERO_ERROR;
1279 matcher.setStackLimit(10000, status);
1280 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1281 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1282 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1283 }
57a6839d 1284
46f4442e
A
1285 // A pattern that doesn't save state should work with
1286 // a minimal sized stack
1287 {
1288 UErrorCode status = U_ZERO_ERROR;
1289 UnicodeString testString = "abc";
1290 RegexMatcher matcher("abc", testString, 0, status);
1291 REGEX_CHECK_STATUS;
1292 matcher.setStackLimit(30, status);
1293 REGEX_CHECK_STATUS;
1294 REGEX_ASSERT(matcher.matches(status) == TRUE);
1295 REGEX_CHECK_STATUS;
1296 REGEX_ASSERT(matcher.getStackLimit() == 30);
57a6839d 1297
46f4442e
A
1298 // Negative stack sizes should fail
1299 status = U_ZERO_ERROR;
1300 matcher.setStackLimit(1000, status);
1301 REGEX_CHECK_STATUS;
1302 matcher.setStackLimit(-1, status);
1303 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1304 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1305 }
57a6839d 1306
46f4442e 1307
b75a7d8f
A
1308}
1309
1310
1311
1312
1313
1314
1315//---------------------------------------------------------------------------
1316//
374ca955 1317// API_Replace API test for class RegexMatcher, testing the
b75a7d8f
A
1318// Replace family of functions.
1319//
1320//---------------------------------------------------------------------------
1321void RegexTest::API_Replace() {
1322 //
1323 // Replace
1324 //
1325 int32_t flags=0;
1326 UParseError pe;
1327 UErrorCode status=U_ZERO_ERROR;
374ca955 1328
b75a7d8f
A
1329 UnicodeString re("abc");
1330 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1331 REGEX_CHECK_STATUS;
1332 UnicodeString data = ".abc..abc...abc..";
1333 // 012345678901234567
1334 RegexMatcher *matcher = pat->matcher(data, status);
374ca955 1335
b75a7d8f
A
1336 //
1337 // Plain vanilla matches.
1338 //
1339 UnicodeString dest;
1340 dest = matcher->replaceFirst("yz", status);
1341 REGEX_CHECK_STATUS;
1342 REGEX_ASSERT(dest == ".yz..abc...abc..");
374ca955 1343
b75a7d8f
A
1344 dest = matcher->replaceAll("yz", status);
1345 REGEX_CHECK_STATUS;
1346 REGEX_ASSERT(dest == ".yz..yz...yz..");
374ca955 1347
b75a7d8f
A
1348 //
1349 // Plain vanilla non-matches.
1350 //
1351 UnicodeString d2 = ".abx..abx...abx..";
1352 matcher->reset(d2);
1353 dest = matcher->replaceFirst("yz", status);
1354 REGEX_CHECK_STATUS;
1355 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 1356
b75a7d8f
A
1357 dest = matcher->replaceAll("yz", status);
1358 REGEX_CHECK_STATUS;
1359 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 1360
b75a7d8f
A
1361 //
1362 // Empty source string
1363 //
1364 UnicodeString d3 = "";
1365 matcher->reset(d3);
1366 dest = matcher->replaceFirst("yz", status);
1367 REGEX_CHECK_STATUS;
1368 REGEX_ASSERT(dest == "");
374ca955 1369
b75a7d8f
A
1370 dest = matcher->replaceAll("yz", status);
1371 REGEX_CHECK_STATUS;
1372 REGEX_ASSERT(dest == "");
374ca955 1373
b75a7d8f
A
1374 //
1375 // Empty substitution string
1376 //
1377 matcher->reset(data); // ".abc..abc...abc.."
1378 dest = matcher->replaceFirst("", status);
1379 REGEX_CHECK_STATUS;
1380 REGEX_ASSERT(dest == "...abc...abc..");
374ca955 1381
b75a7d8f
A
1382 dest = matcher->replaceAll("", status);
1383 REGEX_CHECK_STATUS;
1384 REGEX_ASSERT(dest == "........");
374ca955 1385
b75a7d8f
A
1386 //
1387 // match whole string
1388 //
1389 UnicodeString d4 = "abc";
374ca955 1390 matcher->reset(d4);
b75a7d8f
A
1391 dest = matcher->replaceFirst("xyz", status);
1392 REGEX_CHECK_STATUS;
1393 REGEX_ASSERT(dest == "xyz");
374ca955 1394
b75a7d8f
A
1395 dest = matcher->replaceAll("xyz", status);
1396 REGEX_CHECK_STATUS;
1397 REGEX_ASSERT(dest == "xyz");
374ca955 1398
b75a7d8f
A
1399 //
1400 // Capture Group, simple case
1401 //
1402 UnicodeString re2("a(..)");
1403 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1404 REGEX_CHECK_STATUS;
1405 UnicodeString d5 = "abcdefg";
1406 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1407 REGEX_CHECK_STATUS;
1408 dest = matcher2->replaceFirst("$1$1", status);
1409 REGEX_CHECK_STATUS;
1410 REGEX_ASSERT(dest == "bcbcdefg");
1411
46f4442e 1412 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
b75a7d8f
A
1413 REGEX_CHECK_STATUS;
1414 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1415
1416 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1417 REGEX_CHECK_STATUS;
1418 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1419
46f4442e 1420 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
b75a7d8f
A
1421 replacement = replacement.unescape();
1422 dest = matcher2->replaceFirst(replacement, status);
1423 REGEX_CHECK_STATUS;
1424 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
374ca955 1425
b75a7d8f 1426 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
1427
1428
1429 //
1430 // Replacement String with \u hex escapes
1431 //
1432 {
1433 UnicodeString src = "abc 1 abc 2 abc 3";
46f4442e 1434 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
374ca955
A
1435 matcher->reset(src);
1436 UnicodeString result = matcher->replaceAll(substitute, status);
1437 REGEX_CHECK_STATUS;
1438 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1439 }
1440 {
1441 UnicodeString src = "abc !";
46f4442e 1442 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
374ca955
A
1443 matcher->reset(src);
1444 UnicodeString result = matcher->replaceAll(substitute, status);
1445 REGEX_CHECK_STATUS;
1446 UnicodeString expected = UnicodeString("--");
1447 expected.append((UChar32)0x10000);
1448 expected.append("-- !");
1449 REGEX_ASSERT(result == expected);
1450 }
b75a7d8f 1451 // TODO: need more through testing of capture substitutions.
374ca955
A
1452
1453 // Bug 4057
1454 //
1455 {
1456 status = U_ZERO_ERROR;
1457 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1458 RegexMatcher m("ss(.*?)ee", 0, status);
1459 REGEX_CHECK_STATUS;
1460 UnicodeString result;
1461
1462 // Multiple finds do NOT bump up the previous appendReplacement postion.
1463 m.reset(s);
1464 m.find();
1465 m.find();
1466 m.appendReplacement(result, "ooh", status);
1467 REGEX_CHECK_STATUS;
1468 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1469
1470 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1471 status = U_ZERO_ERROR;
1472 result.truncate(0);
1473 m.reset(10, status);
1474 m.find();
1475 m.find();
1476 m.appendReplacement(result, "ooh", status);
1477 REGEX_CHECK_STATUS;
1478 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1479
1480 // find() at interior of string, appendReplacemnt still starts at beginning.
1481 status = U_ZERO_ERROR;
1482 result.truncate(0);
1483 m.reset();
1484 m.find(10, status);
1485 m.find();
1486 m.appendReplacement(result, "ooh", status);
1487 REGEX_CHECK_STATUS;
1488 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1489
1490 m.appendTail(result);
1491 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1492
1493 }
1494
b75a7d8f
A
1495 delete matcher2;
1496 delete pat2;
1497 delete matcher;
1498 delete pat;
1499}
1500
1501
1502//---------------------------------------------------------------------------
1503//
1504// API_Pattern Test that the API for class RegexPattern is
1505// present and nominally working.
1506//
1507//---------------------------------------------------------------------------
1508void RegexTest::API_Pattern() {
1509 RegexPattern pata; // Test default constructor to not crash.
1510 RegexPattern patb;
1511
1512 REGEX_ASSERT(pata == patb);
1513 REGEX_ASSERT(pata == pata);
1514
1515 UnicodeString re1("abc[a-l][m-z]");
1516 UnicodeString re2("def");
1517 UErrorCode status = U_ZERO_ERROR;
1518 UParseError pe;
1519
1520 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1521 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1522 REGEX_CHECK_STATUS;
1523 REGEX_ASSERT(*pat1 == *pat1);
1524 REGEX_ASSERT(*pat1 != pata);
1525
1526 // Assign
1527 patb = *pat1;
1528 REGEX_ASSERT(patb == *pat1);
1529
1530 // Copy Construct
1531 RegexPattern patc(*pat1);
1532 REGEX_ASSERT(patc == *pat1);
1533 REGEX_ASSERT(patb == patc);
1534 REGEX_ASSERT(pat1 != pat2);
1535 patb = *pat2;
1536 REGEX_ASSERT(patb != patc);
1537 REGEX_ASSERT(patb == *pat2);
1538
1539 // Compile with no flags.
1540 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1541 REGEX_ASSERT(*pat1a == *pat1);
1542
1543 REGEX_ASSERT(pat1a->flags() == 0);
374ca955 1544
b75a7d8f
A
1545 // Compile with different flags should be not equal
1546 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1547 REGEX_CHECK_STATUS;
1548
1549 REGEX_ASSERT(*pat1b != *pat1a);
1550 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1551 REGEX_ASSERT(pat1a->flags() == 0);
1552 delete pat1b;
b75a7d8f
A
1553
1554 // clone
1555 RegexPattern *pat1c = pat1->clone();
1556 REGEX_ASSERT(*pat1c == *pat1);
1557 REGEX_ASSERT(*pat1c != *pat2);
1558
b75a7d8f
A
1559 delete pat1c;
1560 delete pat1a;
1561 delete pat1;
1562 delete pat2;
1563
1564
374ca955
A
1565 //
1566 // Verify that a matcher created from a cloned pattern works.
1567 // (Jitterbug 3423)
1568 //
1569 {
1570 UErrorCode status = U_ZERO_ERROR;
46f4442e 1571 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
374ca955
A
1572 RegexPattern *pClone = pSource->clone();
1573 delete pSource;
1574 RegexMatcher *mFromClone = pClone->matcher(status);
1575 REGEX_CHECK_STATUS;
1576 UnicodeString s = "Hello World";
1577 mFromClone->reset(s);
1578 REGEX_ASSERT(mFromClone->find() == TRUE);
1579 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1580 REGEX_ASSERT(mFromClone->find() == TRUE);
1581 REGEX_ASSERT(mFromClone->group(status) == "World");
1582 REGEX_ASSERT(mFromClone->find() == FALSE);
1583 delete mFromClone;
1584 delete pClone;
1585 }
1586
b75a7d8f
A
1587 //
1588 // matches convenience API
1589 //
1590 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1591 REGEX_CHECK_STATUS;
1592 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1593 REGEX_CHECK_STATUS;
1594 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1595 REGEX_CHECK_STATUS;
1596 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1597 REGEX_CHECK_STATUS;
1598 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1599 REGEX_CHECK_STATUS;
1600 status = U_INDEX_OUTOFBOUNDS_ERROR;
1601 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1602 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1603
1604
1605 //
1606 // Split()
1607 //
1608 status = U_ZERO_ERROR;
1609 pat1 = RegexPattern::compile(" +", pe, status);
1610 REGEX_CHECK_STATUS;
1611 UnicodeString fields[10];
1612
1613 int32_t n;
1614 n = pat1->split("Now is the time", fields, 10, status);
1615 REGEX_CHECK_STATUS;
1616 REGEX_ASSERT(n==4);
1617 REGEX_ASSERT(fields[0]=="Now");
1618 REGEX_ASSERT(fields[1]=="is");
1619 REGEX_ASSERT(fields[2]=="the");
1620 REGEX_ASSERT(fields[3]=="time");
1621 REGEX_ASSERT(fields[4]=="");
1622
1623 n = pat1->split("Now is the time", fields, 2, status);
1624 REGEX_CHECK_STATUS;
1625 REGEX_ASSERT(n==2);
1626 REGEX_ASSERT(fields[0]=="Now");
1627 REGEX_ASSERT(fields[1]=="is the time");
1628 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1629
1630 fields[1] = "*";
1631 status = U_ZERO_ERROR;
1632 n = pat1->split("Now is the time", fields, 1, status);
1633 REGEX_CHECK_STATUS;
1634 REGEX_ASSERT(n==1);
1635 REGEX_ASSERT(fields[0]=="Now is the time");
1636 REGEX_ASSERT(fields[1]=="*");
1637 status = U_ZERO_ERROR;
1638
1639 n = pat1->split(" Now is the time ", fields, 10, status);
1640 REGEX_CHECK_STATUS;
4388f060 1641 REGEX_ASSERT(n==6);
b75a7d8f
A
1642 REGEX_ASSERT(fields[0]=="");
1643 REGEX_ASSERT(fields[1]=="Now");
1644 REGEX_ASSERT(fields[2]=="is");
1645 REGEX_ASSERT(fields[3]=="the");
1646 REGEX_ASSERT(fields[4]=="time");
1647 REGEX_ASSERT(fields[5]=="");
1648
1649 n = pat1->split(" ", fields, 10, status);
1650 REGEX_CHECK_STATUS;
4388f060 1651 REGEX_ASSERT(n==2);
b75a7d8f 1652 REGEX_ASSERT(fields[0]=="");
4388f060 1653 REGEX_ASSERT(fields[1]=="");
b75a7d8f
A
1654
1655 fields[0] = "foo";
1656 n = pat1->split("", fields, 10, status);
1657 REGEX_CHECK_STATUS;
1658 REGEX_ASSERT(n==0);
1659 REGEX_ASSERT(fields[0]=="foo");
1660
1661 delete pat1;
1662
1663 // split, with a pattern with (capture)
46f4442e 1664 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
b75a7d8f
A
1665 REGEX_CHECK_STATUS;
1666
1667 status = U_ZERO_ERROR;
1668 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1669 REGEX_CHECK_STATUS;
4388f060 1670 REGEX_ASSERT(n==7);
b75a7d8f
A
1671 REGEX_ASSERT(fields[0]=="");
1672 REGEX_ASSERT(fields[1]=="a");
1673 REGEX_ASSERT(fields[2]=="Now is ");
1674 REGEX_ASSERT(fields[3]=="b");
1675 REGEX_ASSERT(fields[4]=="the time");
1676 REGEX_ASSERT(fields[5]=="c");
1677 REGEX_ASSERT(fields[6]=="");
1678 REGEX_ASSERT(status==U_ZERO_ERROR);
1679
1680 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1681 REGEX_CHECK_STATUS;
4388f060 1682 REGEX_ASSERT(n==7);
b75a7d8f
A
1683 REGEX_ASSERT(fields[0]==" ");
1684 REGEX_ASSERT(fields[1]=="a");
1685 REGEX_ASSERT(fields[2]=="Now is ");
1686 REGEX_ASSERT(fields[3]=="b");
1687 REGEX_ASSERT(fields[4]=="the time");
1688 REGEX_ASSERT(fields[5]=="c");
1689 REGEX_ASSERT(fields[6]=="");
1690
1691 status = U_ZERO_ERROR;
1692 fields[6] = "foo";
1693 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1694 REGEX_CHECK_STATUS;
1695 REGEX_ASSERT(n==6);
1696 REGEX_ASSERT(fields[0]==" ");
1697 REGEX_ASSERT(fields[1]=="a");
1698 REGEX_ASSERT(fields[2]=="Now is ");
1699 REGEX_ASSERT(fields[3]=="b");
1700 REGEX_ASSERT(fields[4]=="the time");
4388f060 1701 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
b75a7d8f
A
1702 REGEX_ASSERT(fields[6]=="foo");
1703
1704 status = U_ZERO_ERROR;
1705 fields[5] = "foo";
1706 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1707 REGEX_CHECK_STATUS;
1708 REGEX_ASSERT(n==5);
1709 REGEX_ASSERT(fields[0]==" ");
1710 REGEX_ASSERT(fields[1]=="a");
1711 REGEX_ASSERT(fields[2]=="Now is ");
1712 REGEX_ASSERT(fields[3]=="b");
1713 REGEX_ASSERT(fields[4]=="the time<c>");
1714 REGEX_ASSERT(fields[5]=="foo");
1715
1716 status = U_ZERO_ERROR;
1717 fields[5] = "foo";
1718 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1719 REGEX_CHECK_STATUS;
1720 REGEX_ASSERT(n==5);
1721 REGEX_ASSERT(fields[0]==" ");
1722 REGEX_ASSERT(fields[1]=="a");
1723 REGEX_ASSERT(fields[2]=="Now is ");
1724 REGEX_ASSERT(fields[3]=="b");
1725 REGEX_ASSERT(fields[4]=="the time");
1726 REGEX_ASSERT(fields[5]=="foo");
1727
1728 status = U_ZERO_ERROR;
1729 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1730 REGEX_CHECK_STATUS;
1731 REGEX_ASSERT(n==4);
1732 REGEX_ASSERT(fields[0]==" ");
1733 REGEX_ASSERT(fields[1]=="a");
1734 REGEX_ASSERT(fields[2]=="Now is ");
1735 REGEX_ASSERT(fields[3]=="the time<c>");
1736 status = U_ZERO_ERROR;
1737 delete pat1;
1738
1739 pat1 = RegexPattern::compile("([-,])", pe, status);
1740 REGEX_CHECK_STATUS;
1741 n = pat1->split("1-10,20", fields, 10, status);
1742 REGEX_CHECK_STATUS;
1743 REGEX_ASSERT(n==5);
1744 REGEX_ASSERT(fields[0]=="1");
1745 REGEX_ASSERT(fields[1]=="-");
1746 REGEX_ASSERT(fields[2]=="10");
1747 REGEX_ASSERT(fields[3]==",");
1748 REGEX_ASSERT(fields[4]=="20");
1749 delete pat1;
1750
4388f060
A
1751 // Test split of string with empty trailing fields
1752 pat1 = RegexPattern::compile(",", pe, status);
1753 REGEX_CHECK_STATUS;
1754 n = pat1->split("a,b,c,", fields, 10, status);
1755 REGEX_CHECK_STATUS;
1756 REGEX_ASSERT(n==4);
1757 REGEX_ASSERT(fields[0]=="a");
1758 REGEX_ASSERT(fields[1]=="b");
1759 REGEX_ASSERT(fields[2]=="c");
1760 REGEX_ASSERT(fields[3]=="");
1761
1762 n = pat1->split("a,,,", fields, 10, status);
1763 REGEX_CHECK_STATUS;
1764 REGEX_ASSERT(n==4);
1765 REGEX_ASSERT(fields[0]=="a");
1766 REGEX_ASSERT(fields[1]=="");
1767 REGEX_ASSERT(fields[2]=="");
1768 REGEX_ASSERT(fields[3]=="");
1769 delete pat1;
1770
1771 // Split Separator with zero length match.
1772 pat1 = RegexPattern::compile(":?", pe, status);
1773 REGEX_CHECK_STATUS;
1774 n = pat1->split("abc", fields, 10, status);
1775 REGEX_CHECK_STATUS;
1776 REGEX_ASSERT(n==5);
1777 REGEX_ASSERT(fields[0]=="");
1778 REGEX_ASSERT(fields[1]=="a");
1779 REGEX_ASSERT(fields[2]=="b");
1780 REGEX_ASSERT(fields[3]=="c");
1781 REGEX_ASSERT(fields[4]=="");
1782
1783 delete pat1;
b75a7d8f
A
1784
1785 //
1786 // RegexPattern::pattern()
1787 //
1788 pat1 = new RegexPattern();
1789 REGEX_ASSERT(pat1->pattern() == "");
1790 delete pat1;
1791
1792 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1793 REGEX_CHECK_STATUS;
1794 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1795 delete pat1;
1796
1797
1798 //
1799 // classID functions
1800 //
1801 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1802 REGEX_CHECK_STATUS;
1803 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1804 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
374ca955
A
1805 UnicodeString Hello("Hello, world.");
1806 RegexMatcher *m = pat1->matcher(Hello, status);
b75a7d8f
A
1807 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1808 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1810 delete m;
1811 delete pat1;
1812
1813}
1814
1815//---------------------------------------------------------------------------
1816//
729e4ab9
A
1817// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1818// is present and working, but excluding functions
1819// implementing replace operations.
b75a7d8f
A
1820//
1821//---------------------------------------------------------------------------
729e4ab9
A
1822void RegexTest::API_Match_UTF8() {
1823 UParseError pe;
1824 UErrorCode status=U_ZERO_ERROR;
1825 int32_t flags = 0;
b75a7d8f
A
1826
1827 //
729e4ab9 1828 // Debug - slide failing test cases early
b75a7d8f 1829 //
729e4ab9
A
1830#if 0
1831 {
374ca955 1832 }
729e4ab9
A
1833 return;
1834#endif
b75a7d8f
A
1835
1836 //
729e4ab9 1837 // Simple pattern compilation
b75a7d8f 1838 //
729e4ab9
A
1839 {
1840 UText re = UTEXT_INITIALIZER;
1841 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
4388f060 1842 REGEX_VERBOSE_TEXT(&re);
729e4ab9
A
1843 RegexPattern *pat2;
1844 pat2 = RegexPattern::compile(&re, flags, pe, status);
1845 REGEX_CHECK_STATUS;
b75a7d8f 1846
729e4ab9
A
1847 UText input1 = UTEXT_INITIALIZER;
1848 UText input2 = UTEXT_INITIALIZER;
1849 UText empty = UTEXT_INITIALIZER;
1850 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1851 REGEX_VERBOSE_TEXT(&input1);
1852 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1853 REGEX_VERBOSE_TEXT(&input2);
1854 utext_openUChars(&empty, NULL, 0, &status);
57a6839d 1855
729e4ab9
A
1856 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1857 int32_t input2Len = strlen("not abc");
b75a7d8f 1858
b75a7d8f 1859
729e4ab9
A
1860 //
1861 // Matcher creation and reset.
1862 //
4388f060 1863 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
729e4ab9
A
1864 REGEX_CHECK_STATUS;
1865 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1866 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1868 m1->reset(&input2);
1869 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1870 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1872 m1->reset(&input1);
1873 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1874 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1875 m1->reset(&empty);
1876 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1877 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
b75a7d8f 1878
729e4ab9
A
1879 //
1880 // reset(pos, status)
1881 //
1882 m1->reset(&input1);
1883 m1->reset(4, status);
1884 REGEX_CHECK_STATUS;
1885 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1886 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f 1887
729e4ab9
A
1888 m1->reset(-1, status);
1889 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f 1890 status = U_ZERO_ERROR;
b75a7d8f 1891
729e4ab9
A
1892 m1->reset(0, status);
1893 REGEX_CHECK_STATUS;
1894 status = U_ZERO_ERROR;
b75a7d8f 1895
729e4ab9
A
1896 m1->reset(input1Len-1, status);
1897 REGEX_CHECK_STATUS;
1898 status = U_ZERO_ERROR;
1899
1900 m1->reset(input1Len, status);
1901 REGEX_CHECK_STATUS;
1902 status = U_ZERO_ERROR;
1903
1904 m1->reset(input1Len+1, status);
1905 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1906 status = U_ZERO_ERROR;
b75a7d8f
A
1907
1908 //
729e4ab9 1909 // match(pos, status)
b75a7d8f 1910 //
729e4ab9
A
1911 m1->reset(&input2);
1912 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913 m1->reset();
1914 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1915 m1->reset();
1916 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1917 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1918 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920
1921 // Match() at end of string should fail, but should not
1922 // be an error.
1923 status = U_ZERO_ERROR;
1924 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1925 REGEX_CHECK_STATUS;
1926
1927 // Match beyond end of string should fail with an error.
1928 status = U_ZERO_ERROR;
1929 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1930 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931
1932 // Successful match at end of string.
1933 {
1934 status = U_ZERO_ERROR;
1935 RegexMatcher m("A?", 0, status); // will match zero length string.
1936 REGEX_CHECK_STATUS;
1937 m.reset(&input1);
1938 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1939 REGEX_CHECK_STATUS;
1940 m.reset(&empty);
1941 REGEX_ASSERT(m.matches(0, status) == TRUE);
1942 REGEX_CHECK_STATUS;
b75a7d8f
A
1943 }
1944
1945
1946 //
729e4ab9 1947 // lookingAt(pos, status)
b75a7d8f 1948 //
729e4ab9
A
1949 status = U_ZERO_ERROR;
1950 m1->reset(&input2); // "not abc"
1951 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1952 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1953 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1954 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1955 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1956 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957 status = U_ZERO_ERROR;
1958 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1959 REGEX_CHECK_STATUS;
1960 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1961 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1962
1963 delete m1;
1964 delete pat2;
57a6839d 1965
729e4ab9
A
1966 utext_close(&re);
1967 utext_close(&input1);
1968 utext_close(&input2);
1969 utext_close(&empty);
1970 }
1971
1972
1973 //
1974 // Capture Group.
1975 // RegexMatcher::start();
1976 // RegexMatcher::end();
1977 // RegexMatcher::groupCount();
1978 //
1979 {
1980 int32_t flags=0;
1981 UParseError pe;
1982 UErrorCode status=U_ZERO_ERROR;
1983 UText re=UTEXT_INITIALIZER;
1984 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985 utext_openUTF8(&re, str_01234567_pat, -1, &status);
57a6839d 1986
729e4ab9
A
1987 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1988 REGEX_CHECK_STATUS;
57a6839d 1989
729e4ab9
A
1990 UText input = UTEXT_INITIALIZER;
1991 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992 utext_openUTF8(&input, str_0123456789, -1, &status);
1993
4388f060 1994 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
729e4ab9
A
1995 REGEX_CHECK_STATUS;
1996 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1997 static const int32_t matchStarts[] = {0, 2, 4, 8};
1998 static const int32_t matchEnds[] = {10, 8, 6, 10};
1999 int32_t i;
2000 for (i=0; i<4; i++) {
2001 int32_t actualStart = matcher->start(i, status);
2002 REGEX_CHECK_STATUS;
2003 if (actualStart != matchStarts[i]) {
2004 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2005 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2006 }
2007 int32_t actualEnd = matcher->end(i, status);
2008 REGEX_CHECK_STATUS;
2009 if (actualEnd != matchEnds[i]) {
2010 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2011 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2012 }
2013 }
2014
2015 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2016 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2017
2018 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2019 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020 matcher->reset();
2021 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2022
2023 matcher->lookingAt(status);
57a6839d 2024
729e4ab9
A
2025 UnicodeString dest;
2026 UText destText = UTEXT_INITIALIZER;
2027 utext_openUnicodeString(&destText, &dest, &status);
2028 UText *result;
2029 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030 // Test shallow-clone API
2031 int64_t group_len;
2032 result = matcher->group((UText *)NULL, group_len, status);
2033 REGEX_CHECK_STATUS;
2034 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035 utext_close(result);
2036 result = matcher->group(0, &destText, group_len, status);
2037 REGEX_CHECK_STATUS;
2038 REGEX_ASSERT(result == &destText);
2039 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2040 // destText is now immutable, reopen it
2041 utext_close(&destText);
2042 utext_openUnicodeString(&destText, &dest, &status);
57a6839d 2043
729e4ab9
A
2044 result = matcher->group(0, NULL, status);
2045 REGEX_CHECK_STATUS;
2046 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047 utext_close(result);
2048 result = matcher->group(0, &destText, status);
2049 REGEX_CHECK_STATUS;
2050 REGEX_ASSERT(result == &destText);
2051 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
57a6839d 2052
729e4ab9
A
2053 result = matcher->group(1, NULL, status);
2054 REGEX_CHECK_STATUS;
2055 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2056 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057 utext_close(result);
2058 result = matcher->group(1, &destText, status);
2059 REGEX_CHECK_STATUS;
2060 REGEX_ASSERT(result == &destText);
2061 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
57a6839d 2062
729e4ab9
A
2063 result = matcher->group(2, NULL, status);
2064 REGEX_CHECK_STATUS;
2065 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2066 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067 utext_close(result);
2068 result = matcher->group(2, &destText, status);
2069 REGEX_CHECK_STATUS;
2070 REGEX_ASSERT(result == &destText);
2071 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
57a6839d 2072
729e4ab9
A
2073 result = matcher->group(3, NULL, status);
2074 REGEX_CHECK_STATUS;
2075 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2076 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077 utext_close(result);
2078 result = matcher->group(3, &destText, status);
2079 REGEX_CHECK_STATUS;
2080 REGEX_ASSERT(result == &destText);
2081 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2082
2083 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2084 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2085 matcher->reset();
2086 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2087
2088 delete matcher;
2089 delete pat;
57a6839d 2090
729e4ab9
A
2091 utext_close(&destText);
2092 utext_close(&input);
2093 utext_close(&re);
2094 }
2095
2096 //
2097 // find
2098 //
2099 {
2100 int32_t flags=0;
2101 UParseError pe;
2102 UErrorCode status=U_ZERO_ERROR;
2103 UText re=UTEXT_INITIALIZER;
2104 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2105 utext_openUTF8(&re, str_abc, -1, &status);
2106
2107 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2108 REGEX_CHECK_STATUS;
2109 UText input = UTEXT_INITIALIZER;
2110 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2111 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2112 // 012345678901234567
2113
4388f060 2114 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
729e4ab9
A
2115 REGEX_CHECK_STATUS;
2116 REGEX_ASSERT(matcher->find());
2117 REGEX_ASSERT(matcher->start(status) == 1);
2118 REGEX_ASSERT(matcher->find());
2119 REGEX_ASSERT(matcher->start(status) == 6);
2120 REGEX_ASSERT(matcher->find());
2121 REGEX_ASSERT(matcher->start(status) == 12);
2122 REGEX_ASSERT(matcher->find() == FALSE);
2123 REGEX_ASSERT(matcher->find() == FALSE);
2124
2125 matcher->reset();
2126 REGEX_ASSERT(matcher->find());
2127 REGEX_ASSERT(matcher->start(status) == 1);
2128
2129 REGEX_ASSERT(matcher->find(0, status));
2130 REGEX_ASSERT(matcher->start(status) == 1);
2131 REGEX_ASSERT(matcher->find(1, status));
2132 REGEX_ASSERT(matcher->start(status) == 1);
2133 REGEX_ASSERT(matcher->find(2, status));
2134 REGEX_ASSERT(matcher->start(status) == 6);
2135 REGEX_ASSERT(matcher->find(12, status));
2136 REGEX_ASSERT(matcher->start(status) == 12);
2137 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2138 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2139 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2140 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2141
2142 status = U_ZERO_ERROR;
2143 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2144 status = U_ZERO_ERROR;
2145 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2146
2147 REGEX_ASSERT(matcher->groupCount() == 0);
2148
2149 delete matcher;
2150 delete pat;
57a6839d 2151
729e4ab9
A
2152 utext_close(&input);
2153 utext_close(&re);
2154 }
2155
2156
2157 //
2158 // find, with \G in pattern (true if at the end of a previous match).
2159 //
2160 {
2161 int32_t flags=0;
2162 UParseError pe;
2163 UErrorCode status=U_ZERO_ERROR;
2164 UText re=UTEXT_INITIALIZER;
2165 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2166 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2167
2168 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
57a6839d 2169
729e4ab9
A
2170 REGEX_CHECK_STATUS;
2171 UText input = UTEXT_INITIALIZER;
2172 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2173 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2174 // 012345678901234567
2175
4388f060 2176 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
729e4ab9
A
2177 REGEX_CHECK_STATUS;
2178 REGEX_ASSERT(matcher->find());
2179 REGEX_ASSERT(matcher->start(status) == 0);
2180 REGEX_ASSERT(matcher->start(1, status) == -1);
2181 REGEX_ASSERT(matcher->start(2, status) == 1);
2182
2183 REGEX_ASSERT(matcher->find());
2184 REGEX_ASSERT(matcher->start(status) == 4);
2185 REGEX_ASSERT(matcher->start(1, status) == 4);
2186 REGEX_ASSERT(matcher->start(2, status) == -1);
2187 REGEX_CHECK_STATUS;
2188
2189 delete matcher;
2190 delete pat;
57a6839d 2191
729e4ab9
A
2192 utext_close(&input);
2193 utext_close(&re);
2194 }
2195
2196 //
2197 // find with zero length matches, match position should bump ahead
2198 // to prevent loops.
2199 //
2200 {
2201 int32_t i;
2202 UErrorCode status=U_ZERO_ERROR;
2203 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2204 // using an always-true look-ahead.
2205 REGEX_CHECK_STATUS;
2206 UText s = UTEXT_INITIALIZER;
2207 utext_openUTF8(&s, " ", -1, &status);
2208 m.reset(&s);
2209 for (i=0; ; i++) {
2210 if (m.find() == FALSE) {
2211 break;
2212 }
2213 REGEX_ASSERT(m.start(status) == i);
2214 REGEX_ASSERT(m.end(status) == i);
2215 }
2216 REGEX_ASSERT(i==5);
2217
2218 // Check that the bump goes over characters outside the BMP OK
2219 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2220 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2221 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2222 m.reset(&s);
2223 for (i=0; ; i+=4) {
2224 if (m.find() == FALSE) {
2225 break;
2226 }
2227 REGEX_ASSERT(m.start(status) == i);
2228 REGEX_ASSERT(m.end(status) == i);
2229 }
2230 REGEX_ASSERT(i==20);
57a6839d 2231
729e4ab9
A
2232 utext_close(&s);
2233 }
2234 {
2235 // find() loop breaking test.
2236 // with pattern of /.?/, should see a series of one char matches, then a single
2237 // match of zero length at the end of the input string.
2238 int32_t i;
2239 UErrorCode status=U_ZERO_ERROR;
2240 RegexMatcher m(".?", 0, status);
2241 REGEX_CHECK_STATUS;
2242 UText s = UTEXT_INITIALIZER;
2243 utext_openUTF8(&s, " ", -1, &status);
2244 m.reset(&s);
2245 for (i=0; ; i++) {
2246 if (m.find() == FALSE) {
2247 break;
2248 }
2249 REGEX_ASSERT(m.start(status) == i);
2250 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2251 }
2252 REGEX_ASSERT(i==5);
57a6839d 2253
729e4ab9
A
2254 utext_close(&s);
2255 }
2256
2257
2258 //
2259 // Matchers with no input string behave as if they had an empty input string.
2260 //
2261
2262 {
2263 UErrorCode status = U_ZERO_ERROR;
2264 RegexMatcher m(".?", 0, status);
2265 REGEX_CHECK_STATUS;
2266 REGEX_ASSERT(m.find());
2267 REGEX_ASSERT(m.start(status) == 0);
2268 REGEX_ASSERT(m.input() == "");
2269 }
2270 {
2271 UErrorCode status = U_ZERO_ERROR;
2272 RegexPattern *p = RegexPattern::compile(".", 0, status);
2273 RegexMatcher *m = p->matcher(status);
2274 REGEX_CHECK_STATUS;
2275
2276 REGEX_ASSERT(m->find() == FALSE);
2277 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2278 delete m;
2279 delete p;
2280 }
57a6839d 2281
729e4ab9
A
2282 //
2283 // Regions
2284 //
2285 {
2286 UErrorCode status = U_ZERO_ERROR;
2287 UText testPattern = UTEXT_INITIALIZER;
2288 UText testText = UTEXT_INITIALIZER;
2289 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2290 REGEX_VERBOSE_TEXT(&testPattern);
2291 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2292 REGEX_VERBOSE_TEXT(&testText);
57a6839d 2293
729e4ab9
A
2294 RegexMatcher m(&testPattern, &testText, 0, status);
2295 REGEX_CHECK_STATUS;
2296 REGEX_ASSERT(m.regionStart() == 0);
2297 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2298 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2299 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 2300
729e4ab9
A
2301 m.region(2,4, status);
2302 REGEX_CHECK_STATUS;
2303 REGEX_ASSERT(m.matches(status));
2304 REGEX_ASSERT(m.start(status)==2);
2305 REGEX_ASSERT(m.end(status)==4);
2306 REGEX_CHECK_STATUS;
57a6839d 2307
729e4ab9
A
2308 m.reset();
2309 REGEX_ASSERT(m.regionStart() == 0);
2310 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
57a6839d 2311
729e4ab9
A
2312 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2313 REGEX_VERBOSE_TEXT(&testText);
2314 m.reset(&testText);
2315 REGEX_ASSERT(m.regionStart() == 0);
2316 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
57a6839d 2317
729e4ab9
A
2318 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2319 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2320 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2321 REGEX_ASSERT(&m == &m.reset());
2322 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
57a6839d 2323
729e4ab9
A
2324 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2325 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326 REGEX_ASSERT(&m == &m.reset());
2327 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
57a6839d 2328
729e4ab9
A
2329 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2330 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2331 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2332 REGEX_ASSERT(&m == &m.reset());
2333 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2334
2335 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2336 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2337 REGEX_ASSERT(&m == &m.reset());
2338 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
57a6839d 2339
729e4ab9
A
2340 utext_close(&testText);
2341 utext_close(&testPattern);
2342 }
57a6839d 2343
729e4ab9
A
2344 //
2345 // hitEnd() and requireEnd()
2346 //
2347 {
2348 UErrorCode status = U_ZERO_ERROR;
2349 UText testPattern = UTEXT_INITIALIZER;
2350 UText testText = UTEXT_INITIALIZER;
2351 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2352 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2353 utext_openUTF8(&testPattern, str_, -1, &status);
2354 utext_openUTF8(&testText, str_aabb, -1, &status);
57a6839d 2355
729e4ab9
A
2356 RegexMatcher m1(&testPattern, &testText, 0, status);
2357 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2358 REGEX_ASSERT(m1.hitEnd() == TRUE);
2359 REGEX_ASSERT(m1.requireEnd() == FALSE);
2360 REGEX_CHECK_STATUS;
57a6839d 2361
729e4ab9
A
2362 status = U_ZERO_ERROR;
2363 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2364 utext_openUTF8(&testPattern, str_a, -1, &status);
2365 RegexMatcher m2(&testPattern, &testText, 0, status);
2366 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2367 REGEX_ASSERT(m2.hitEnd() == FALSE);
2368 REGEX_ASSERT(m2.requireEnd() == FALSE);
2369 REGEX_CHECK_STATUS;
2370
2371 status = U_ZERO_ERROR;
2372 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2373 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2374 RegexMatcher m3(&testPattern, &testText, 0, status);
2375 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2376 REGEX_ASSERT(m3.hitEnd() == TRUE);
2377 REGEX_ASSERT(m3.requireEnd() == TRUE);
2378 REGEX_CHECK_STATUS;
57a6839d 2379
729e4ab9
A
2380 utext_close(&testText);
2381 utext_close(&testPattern);
2382 }
2383}
2384
2385
2386//---------------------------------------------------------------------------
2387//
2388// API_Replace_UTF8 API test for class RegexMatcher, testing the
2389// Replace family of functions.
2390//
2391//---------------------------------------------------------------------------
2392void RegexTest::API_Replace_UTF8() {
2393 //
2394 // Replace
2395 //
2396 int32_t flags=0;
2397 UParseError pe;
2398 UErrorCode status=U_ZERO_ERROR;
2399
2400 UText re=UTEXT_INITIALIZER;
2401 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2402 REGEX_VERBOSE_TEXT(&re);
2403 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2404 REGEX_CHECK_STATUS;
57a6839d 2405
729e4ab9
A
2406 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2407 // 012345678901234567
2408 UText dataText = UTEXT_INITIALIZER;
2409 utext_openUTF8(&dataText, data, -1, &status);
2410 REGEX_CHECK_STATUS;
2411 REGEX_VERBOSE_TEXT(&dataText);
4388f060 2412 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
729e4ab9
A
2413
2414 //
2415 // Plain vanilla matches.
2416 //
2417 UnicodeString dest;
2418 UText destText = UTEXT_INITIALIZER;
2419 utext_openUnicodeString(&destText, &dest, &status);
2420 UText *result;
57a6839d 2421
729e4ab9 2422 UText replText = UTEXT_INITIALIZER;
57a6839d 2423
729e4ab9
A
2424 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2425 utext_openUTF8(&replText, str_yz, -1, &status);
2426 REGEX_VERBOSE_TEXT(&replText);
2427 result = matcher->replaceFirst(&replText, NULL, status);
2428 REGEX_CHECK_STATUS;
2429 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2430 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431 utext_close(result);
2432 result = matcher->replaceFirst(&replText, &destText, status);
2433 REGEX_CHECK_STATUS;
2434 REGEX_ASSERT(result == &destText);
2435 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2436
2437 result = matcher->replaceAll(&replText, NULL, status);
2438 REGEX_CHECK_STATUS;
2439 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2440 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2441 utext_close(result);
2442
2443 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2444 result = matcher->replaceAll(&replText, &destText, status);
2445 REGEX_CHECK_STATUS;
2446 REGEX_ASSERT(result == &destText);
2447 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2448
2449 //
2450 // Plain vanilla non-matches.
2451 //
2452 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2453 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2454 matcher->reset(&dataText);
57a6839d 2455
729e4ab9
A
2456 result = matcher->replaceFirst(&replText, NULL, status);
2457 REGEX_CHECK_STATUS;
2458 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459 utext_close(result);
2460 result = matcher->replaceFirst(&replText, &destText, status);
2461 REGEX_CHECK_STATUS;
2462 REGEX_ASSERT(result == &destText);
2463 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2464
2465 result = matcher->replaceAll(&replText, NULL, status);
2466 REGEX_CHECK_STATUS;
2467 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2468 utext_close(result);
2469 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470 result = matcher->replaceAll(&replText, &destText, status);
2471 REGEX_CHECK_STATUS;
2472 REGEX_ASSERT(result == &destText);
2473 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2474
2475 //
2476 // Empty source string
2477 //
2478 utext_openUTF8(&dataText, NULL, 0, &status);
2479 matcher->reset(&dataText);
57a6839d 2480
729e4ab9
A
2481 result = matcher->replaceFirst(&replText, NULL, status);
2482 REGEX_CHECK_STATUS;
2483 REGEX_ASSERT_UTEXT_UTF8("", result);
2484 utext_close(result);
2485 result = matcher->replaceFirst(&replText, &destText, status);
2486 REGEX_CHECK_STATUS;
2487 REGEX_ASSERT(result == &destText);
2488 REGEX_ASSERT_UTEXT_UTF8("", result);
2489
2490 result = matcher->replaceAll(&replText, NULL, status);
2491 REGEX_CHECK_STATUS;
2492 REGEX_ASSERT_UTEXT_UTF8("", result);
2493 utext_close(result);
2494 result = matcher->replaceAll(&replText, &destText, status);
2495 REGEX_CHECK_STATUS;
2496 REGEX_ASSERT(result == &destText);
2497 REGEX_ASSERT_UTEXT_UTF8("", result);
2498
2499 //
2500 // Empty substitution string
2501 //
2502 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2503 matcher->reset(&dataText);
57a6839d 2504
729e4ab9
A
2505 utext_openUTF8(&replText, NULL, 0, &status);
2506 result = matcher->replaceFirst(&replText, NULL, status);
2507 REGEX_CHECK_STATUS;
2508 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2509 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510 utext_close(result);
2511 result = matcher->replaceFirst(&replText, &destText, status);
2512 REGEX_CHECK_STATUS;
2513 REGEX_ASSERT(result == &destText);
2514 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2515
2516 result = matcher->replaceAll(&replText, NULL, status);
2517 REGEX_CHECK_STATUS;
2518 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2519 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2520 utext_close(result);
2521 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522 result = matcher->replaceAll(&replText, &destText, status);
2523 REGEX_CHECK_STATUS;
2524 REGEX_ASSERT(result == &destText);
2525 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2526
2527 //
2528 // match whole string
2529 //
2530 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2531 utext_openUTF8(&dataText, str_abc, -1, &status);
2532 matcher->reset(&dataText);
2533
2534 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2535 utext_openUTF8(&replText, str_xyz, -1, &status);
2536 result = matcher->replaceFirst(&replText, NULL, status);
2537 REGEX_CHECK_STATUS;
2538 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539 utext_close(result);
2540 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2541 result = matcher->replaceFirst(&replText, &destText, status);
2542 REGEX_CHECK_STATUS;
2543 REGEX_ASSERT(result == &destText);
2544 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2545
2546 result = matcher->replaceAll(&replText, NULL, status);
2547 REGEX_CHECK_STATUS;
2548 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2549 utext_close(result);
2550 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2551 result = matcher->replaceAll(&replText, &destText, status);
2552 REGEX_CHECK_STATUS;
2553 REGEX_ASSERT(result == &destText);
2554 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2555
2556 //
2557 // Capture Group, simple case
2558 //
2559 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2560 utext_openUTF8(&re, str_add, -1, &status);
2561 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2562 REGEX_CHECK_STATUS;
2563
2564 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2565 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
4388f060 2566 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
729e4ab9 2567 REGEX_CHECK_STATUS;
57a6839d 2568
729e4ab9
A
2569 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2570 utext_openUTF8(&replText, str_11, -1, &status);
2571 result = matcher2->replaceFirst(&replText, NULL, status);
2572 REGEX_CHECK_STATUS;
2573 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2574 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2575 utext_close(result);
2576 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577 result = matcher2->replaceFirst(&replText, &destText, status);
2578 REGEX_CHECK_STATUS;
2579 REGEX_ASSERT(result == &destText);
2580 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
57a6839d
A
2581
2582 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
4388f060
A
2583 utext_openUTF8(&replText, str_v, -1, &status);
2584 REGEX_VERBOSE_TEXT(&replText);
729e4ab9
A
2585 result = matcher2->replaceFirst(&replText, NULL, status);
2586 REGEX_CHECK_STATUS;
2587 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2588 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2589 utext_close(result);
2590 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2591 result = matcher2->replaceFirst(&replText, &destText, status);
2592 REGEX_CHECK_STATUS;
2593 REGEX_ASSERT(result == &destText);
2594 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
57a6839d 2595
729e4ab9
A
2596 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2597 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2598 result = matcher2->replaceFirst(&replText, NULL, status);
2599 REGEX_CHECK_STATUS;
2600 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2601 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2602 utext_close(result);
2603 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2604 result = matcher2->replaceFirst(&replText, &destText, status);
2605 REGEX_CHECK_STATUS;
2606 REGEX_ASSERT(result == &destText);
2607 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2608
2609 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2610 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2611 // 012345678901234567890123456
2612 supplDigitChars[22] = 0xF0;
2613 supplDigitChars[23] = 0x9D;
2614 supplDigitChars[24] = 0x9F;
2615 supplDigitChars[25] = 0x8F;
2616 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
57a6839d 2617
729e4ab9
A
2618 result = matcher2->replaceFirst(&replText, NULL, status);
2619 REGEX_CHECK_STATUS;
2620 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2621 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2622 utext_close(result);
2623 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2624 result = matcher2->replaceFirst(&replText, &destText, status);
2625 REGEX_CHECK_STATUS;
2626 REGEX_ASSERT(result == &destText);
2627 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2628 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2629 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2630 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2631// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632 utext_close(result);
2633 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2634 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2635 REGEX_ASSERT(result == &destText);
2636// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2637
2638 //
2639 // Replacement String with \u hex escapes
2640 //
2641 {
2642 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2643 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2644 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2645 utext_openUTF8(&replText, str_u0043, -1, &status);
2646 matcher->reset(&dataText);
57a6839d 2647
729e4ab9
A
2648 result = matcher->replaceAll(&replText, NULL, status);
2649 REGEX_CHECK_STATUS;
2650 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2651 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2652 utext_close(result);
2653 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2654 result = matcher->replaceAll(&replText, &destText, status);
2655 REGEX_CHECK_STATUS;
2656 REGEX_ASSERT(result == &destText);
2657 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2658 }
2659 {
2660 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2661 utext_openUTF8(&dataText, str_abc, -1, &status);
2662 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2663 utext_openUTF8(&replText, str_U00010000, -1, &status);
2664 matcher->reset(&dataText);
2665
2666 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
57a6839d 2667 // 0123456789
729e4ab9
A
2668 expected[2] = 0xF0;
2669 expected[3] = 0x90;
2670 expected[4] = 0x80;
2671 expected[5] = 0x80;
2672
2673 result = matcher->replaceAll(&replText, NULL, status);
2674 REGEX_CHECK_STATUS;
2675 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2676 utext_close(result);
2677 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678 result = matcher->replaceAll(&replText, &destText, status);
2679 REGEX_CHECK_STATUS;
2680 REGEX_ASSERT(result == &destText);
2681 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2682 }
2683 // TODO: need more through testing of capture substitutions.
2684
2685 // Bug 4057
2686 //
2687 {
2688 status = U_ZERO_ERROR;
2689const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2690const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2691const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2692 utext_openUTF8(&re, str_ssee, -1, &status);
2693 utext_openUTF8(&dataText, str_blah, -1, &status);
2694 utext_openUTF8(&replText, str_ooh, -1, &status);
57a6839d 2695
729e4ab9
A
2696 RegexMatcher m(&re, 0, status);
2697 REGEX_CHECK_STATUS;
57a6839d 2698
729e4ab9
A
2699 UnicodeString result;
2700 UText resultText = UTEXT_INITIALIZER;
2701 utext_openUnicodeString(&resultText, &result, &status);
2702
2703 // Multiple finds do NOT bump up the previous appendReplacement postion.
2704 m.reset(&dataText);
2705 m.find();
2706 m.find();
2707 m.appendReplacement(&resultText, &replText, status);
2708 REGEX_CHECK_STATUS;
2709 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2710 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2711
2712 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2713 status = U_ZERO_ERROR;
2714 result.truncate(0);
2715 utext_openUnicodeString(&resultText, &result, &status);
2716 m.reset(10, status);
2717 m.find();
2718 m.find();
2719 m.appendReplacement(&resultText, &replText, status);
2720 REGEX_CHECK_STATUS;
2721 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2722 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2723
2724 // find() at interior of string, appendReplacement still starts at beginning.
2725 status = U_ZERO_ERROR;
2726 result.truncate(0);
2727 utext_openUnicodeString(&resultText, &result, &status);
2728 m.reset();
2729 m.find(10, status);
2730 m.find();
2731 m.appendReplacement(&resultText, &replText, status);
2732 REGEX_CHECK_STATUS;
2733 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2734 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2735
2736 m.appendTail(&resultText, status);
2737 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2738 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
57a6839d 2739
729e4ab9
A
2740 utext_close(&resultText);
2741 }
2742
2743 delete matcher2;
2744 delete pat2;
2745 delete matcher;
2746 delete pat;
57a6839d 2747
729e4ab9
A
2748 utext_close(&dataText);
2749 utext_close(&replText);
2750 utext_close(&destText);
2751 utext_close(&re);
2752}
2753
2754
2755//---------------------------------------------------------------------------
2756//
2757// API_Pattern_UTF8 Test that the API for class RegexPattern is
2758// present and nominally working.
2759//
2760//---------------------------------------------------------------------------
2761void RegexTest::API_Pattern_UTF8() {
2762 RegexPattern pata; // Test default constructor to not crash.
2763 RegexPattern patb;
2764
2765 REGEX_ASSERT(pata == patb);
2766 REGEX_ASSERT(pata == pata);
2767
2768 UText re1 = UTEXT_INITIALIZER;
2769 UText re2 = UTEXT_INITIALIZER;
2770 UErrorCode status = U_ZERO_ERROR;
2771 UParseError pe;
57a6839d 2772
729e4ab9
A
2773 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2774 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2775 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2776 utext_openUTF8(&re2, str_def, -1, &status);
2777
2778 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2779 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2780 REGEX_CHECK_STATUS;
2781 REGEX_ASSERT(*pat1 == *pat1);
2782 REGEX_ASSERT(*pat1 != pata);
2783
2784 // Assign
2785 patb = *pat1;
2786 REGEX_ASSERT(patb == *pat1);
2787
2788 // Copy Construct
2789 RegexPattern patc(*pat1);
2790 REGEX_ASSERT(patc == *pat1);
2791 REGEX_ASSERT(patb == patc);
2792 REGEX_ASSERT(pat1 != pat2);
2793 patb = *pat2;
2794 REGEX_ASSERT(patb != patc);
2795 REGEX_ASSERT(patb == *pat2);
2796
2797 // Compile with no flags.
2798 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2799 REGEX_ASSERT(*pat1a == *pat1);
2800
2801 REGEX_ASSERT(pat1a->flags() == 0);
2802
2803 // Compile with different flags should be not equal
2804 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2805 REGEX_CHECK_STATUS;
2806
2807 REGEX_ASSERT(*pat1b != *pat1a);
2808 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2809 REGEX_ASSERT(pat1a->flags() == 0);
2810 delete pat1b;
2811
2812 // clone
2813 RegexPattern *pat1c = pat1->clone();
2814 REGEX_ASSERT(*pat1c == *pat1);
2815 REGEX_ASSERT(*pat1c != *pat2);
2816
2817 delete pat1c;
2818 delete pat1a;
2819 delete pat1;
2820 delete pat2;
57a6839d 2821
729e4ab9
A
2822 utext_close(&re1);
2823 utext_close(&re2);
2824
2825
2826 //
2827 // Verify that a matcher created from a cloned pattern works.
2828 // (Jitterbug 3423)
2829 //
2830 {
2831 UErrorCode status = U_ZERO_ERROR;
2832 UText pattern = UTEXT_INITIALIZER;
2833 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2834 utext_openUTF8(&pattern, str_pL, -1, &status);
57a6839d 2835
729e4ab9
A
2836 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2837 RegexPattern *pClone = pSource->clone();
2838 delete pSource;
2839 RegexMatcher *mFromClone = pClone->matcher(status);
2840 REGEX_CHECK_STATUS;
57a6839d 2841
729e4ab9
A
2842 UText input = UTEXT_INITIALIZER;
2843 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2844 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2845 mFromClone->reset(&input);
2846 REGEX_ASSERT(mFromClone->find() == TRUE);
2847 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2848 REGEX_ASSERT(mFromClone->find() == TRUE);
2849 REGEX_ASSERT(mFromClone->group(status) == "World");
2850 REGEX_ASSERT(mFromClone->find() == FALSE);
2851 delete mFromClone;
2852 delete pClone;
57a6839d 2853
729e4ab9
A
2854 utext_close(&input);
2855 utext_close(&pattern);
2856 }
2857
2858 //
2859 // matches convenience API
2860 //
2861 {
2862 UErrorCode status = U_ZERO_ERROR;
2863 UText pattern = UTEXT_INITIALIZER;
2864 UText input = UTEXT_INITIALIZER;
57a6839d 2865
729e4ab9
A
2866 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2867 utext_openUTF8(&input, str_randominput, -1, &status);
2868
2869 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2870 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2871 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2872 REGEX_CHECK_STATUS;
57a6839d 2873
729e4ab9
A
2874 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2875 utext_openUTF8(&pattern, str_abc, -1, &status);
2876 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2877 REGEX_CHECK_STATUS;
57a6839d 2878
729e4ab9
A
2879 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2880 utext_openUTF8(&pattern, str_nput, -1, &status);
2881 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2882 REGEX_CHECK_STATUS;
57a6839d 2883
729e4ab9
A
2884 utext_openUTF8(&pattern, str_randominput, -1, &status);
2885 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2886 REGEX_CHECK_STATUS;
2887
2888 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2889 utext_openUTF8(&pattern, str_u, -1, &status);
2890 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2891 REGEX_CHECK_STATUS;
57a6839d 2892
729e4ab9
A
2893 utext_openUTF8(&input, str_abc, -1, &status);
2894 utext_openUTF8(&pattern, str_abc, -1, &status);
2895 status = U_INDEX_OUTOFBOUNDS_ERROR;
2896 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2897 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
57a6839d 2898
729e4ab9
A
2899 utext_close(&input);
2900 utext_close(&pattern);
2901 }
2902
2903
2904 //
2905 // Split()
2906 //
2907 status = U_ZERO_ERROR;
2908 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2909 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2910 pat1 = RegexPattern::compile(&re1, pe, status);
2911 REGEX_CHECK_STATUS;
2912 UnicodeString fields[10];
2913
2914 int32_t n;
2915 n = pat1->split("Now is the time", fields, 10, status);
2916 REGEX_CHECK_STATUS;
2917 REGEX_ASSERT(n==4);
2918 REGEX_ASSERT(fields[0]=="Now");
2919 REGEX_ASSERT(fields[1]=="is");
2920 REGEX_ASSERT(fields[2]=="the");
2921 REGEX_ASSERT(fields[3]=="time");
2922 REGEX_ASSERT(fields[4]=="");
2923
2924 n = pat1->split("Now is the time", fields, 2, status);
2925 REGEX_CHECK_STATUS;
2926 REGEX_ASSERT(n==2);
2927 REGEX_ASSERT(fields[0]=="Now");
2928 REGEX_ASSERT(fields[1]=="is the time");
2929 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2930
2931 fields[1] = "*";
2932 status = U_ZERO_ERROR;
2933 n = pat1->split("Now is the time", fields, 1, status);
2934 REGEX_CHECK_STATUS;
2935 REGEX_ASSERT(n==1);
2936 REGEX_ASSERT(fields[0]=="Now is the time");
2937 REGEX_ASSERT(fields[1]=="*");
2938 status = U_ZERO_ERROR;
2939
2940 n = pat1->split(" Now is the time ", fields, 10, status);
2941 REGEX_CHECK_STATUS;
4388f060 2942 REGEX_ASSERT(n==6);
729e4ab9
A
2943 REGEX_ASSERT(fields[0]=="");
2944 REGEX_ASSERT(fields[1]=="Now");
2945 REGEX_ASSERT(fields[2]=="is");
2946 REGEX_ASSERT(fields[3]=="the");
2947 REGEX_ASSERT(fields[4]=="time");
2948 REGEX_ASSERT(fields[5]=="");
4388f060 2949 REGEX_ASSERT(fields[6]=="");
729e4ab9 2950
4388f060 2951 fields[2] = "*";
729e4ab9
A
2952 n = pat1->split(" ", fields, 10, status);
2953 REGEX_CHECK_STATUS;
4388f060 2954 REGEX_ASSERT(n==2);
729e4ab9 2955 REGEX_ASSERT(fields[0]=="");
4388f060
A
2956 REGEX_ASSERT(fields[1]=="");
2957 REGEX_ASSERT(fields[2]=="*");
729e4ab9
A
2958
2959 fields[0] = "foo";
2960 n = pat1->split("", fields, 10, status);
2961 REGEX_CHECK_STATUS;
2962 REGEX_ASSERT(n==0);
2963 REGEX_ASSERT(fields[0]=="foo");
2964
2965 delete pat1;
2966
2967 // split, with a pattern with (capture)
2968 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2969 pat1 = RegexPattern::compile(&re1, pe, status);
2970 REGEX_CHECK_STATUS;
2971
2972 status = U_ZERO_ERROR;
4388f060 2973 fields[6] = fields[7] = "*";
729e4ab9
A
2974 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2975 REGEX_CHECK_STATUS;
4388f060 2976 REGEX_ASSERT(n==7);
729e4ab9
A
2977 REGEX_ASSERT(fields[0]=="");
2978 REGEX_ASSERT(fields[1]=="a");
2979 REGEX_ASSERT(fields[2]=="Now is ");
2980 REGEX_ASSERT(fields[3]=="b");
2981 REGEX_ASSERT(fields[4]=="the time");
2982 REGEX_ASSERT(fields[5]=="c");
2983 REGEX_ASSERT(fields[6]=="");
4388f060 2984 REGEX_ASSERT(fields[7]=="*");
729e4ab9
A
2985 REGEX_ASSERT(status==U_ZERO_ERROR);
2986
4388f060 2987 fields[6] = fields[7] = "*";
729e4ab9
A
2988 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2989 REGEX_CHECK_STATUS;
4388f060 2990 REGEX_ASSERT(n==7);
729e4ab9
A
2991 REGEX_ASSERT(fields[0]==" ");
2992 REGEX_ASSERT(fields[1]=="a");
2993 REGEX_ASSERT(fields[2]=="Now is ");
2994 REGEX_ASSERT(fields[3]=="b");
2995 REGEX_ASSERT(fields[4]=="the time");
2996 REGEX_ASSERT(fields[5]=="c");
2997 REGEX_ASSERT(fields[6]=="");
4388f060 2998 REGEX_ASSERT(fields[7]=="*");
729e4ab9
A
2999
3000 status = U_ZERO_ERROR;
3001 fields[6] = "foo";
4388f060 3002 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
729e4ab9
A
3003 REGEX_CHECK_STATUS;
3004 REGEX_ASSERT(n==6);
3005 REGEX_ASSERT(fields[0]==" ");
3006 REGEX_ASSERT(fields[1]=="a");
3007 REGEX_ASSERT(fields[2]=="Now is ");
3008 REGEX_ASSERT(fields[3]=="b");
3009 REGEX_ASSERT(fields[4]=="the time");
4388f060 3010 REGEX_ASSERT(fields[5]==" ");
729e4ab9
A
3011 REGEX_ASSERT(fields[6]=="foo");
3012
3013 status = U_ZERO_ERROR;
3014 fields[5] = "foo";
3015 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3016 REGEX_CHECK_STATUS;
3017 REGEX_ASSERT(n==5);
3018 REGEX_ASSERT(fields[0]==" ");
3019 REGEX_ASSERT(fields[1]=="a");
3020 REGEX_ASSERT(fields[2]=="Now is ");
3021 REGEX_ASSERT(fields[3]=="b");
3022 REGEX_ASSERT(fields[4]=="the time<c>");
3023 REGEX_ASSERT(fields[5]=="foo");
3024
3025 status = U_ZERO_ERROR;
3026 fields[5] = "foo";
3027 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3028 REGEX_CHECK_STATUS;
3029 REGEX_ASSERT(n==5);
3030 REGEX_ASSERT(fields[0]==" ");
3031 REGEX_ASSERT(fields[1]=="a");
3032 REGEX_ASSERT(fields[2]=="Now is ");
3033 REGEX_ASSERT(fields[3]=="b");
3034 REGEX_ASSERT(fields[4]=="the time");
3035 REGEX_ASSERT(fields[5]=="foo");
3036
3037 status = U_ZERO_ERROR;
3038 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3039 REGEX_CHECK_STATUS;
3040 REGEX_ASSERT(n==4);
3041 REGEX_ASSERT(fields[0]==" ");
3042 REGEX_ASSERT(fields[1]=="a");
3043 REGEX_ASSERT(fields[2]=="Now is ");
3044 REGEX_ASSERT(fields[3]=="the time<c>");
3045 status = U_ZERO_ERROR;
3046 delete pat1;
3047
3048 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3049 pat1 = RegexPattern::compile(&re1, pe, status);
3050 REGEX_CHECK_STATUS;
3051 n = pat1->split("1-10,20", fields, 10, status);
3052 REGEX_CHECK_STATUS;
3053 REGEX_ASSERT(n==5);
3054 REGEX_ASSERT(fields[0]=="1");
3055 REGEX_ASSERT(fields[1]=="-");
3056 REGEX_ASSERT(fields[2]=="10");
3057 REGEX_ASSERT(fields[3]==",");
3058 REGEX_ASSERT(fields[4]=="20");
3059 delete pat1;
3060
3061
3062 //
3063 // RegexPattern::pattern() and patternText()
3064 //
3065 pat1 = new RegexPattern();
3066 REGEX_ASSERT(pat1->pattern() == "");
3067 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3068 delete pat1;
4388f060
A
3069 const char *helloWorldInvariant = "(Hello, world)*";
3070 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
729e4ab9
A
3071 pat1 = RegexPattern::compile(&re1, pe, status);
3072 REGEX_CHECK_STATUS;
4388f060 3073 REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
729e4ab9
A
3074 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3075 delete pat1;
3076
3077 utext_close(&re1);
3078}
3079
3080
3081//---------------------------------------------------------------------------
3082//
3083// Extended A more thorough check for features of regex patterns
3084// The test cases are in a separate data file,
3085// source/test/testdata/regextst.txt
3086// A description of the test data format is included in that file.
3087//
3088//---------------------------------------------------------------------------
3089
3090const char *
3091RegexTest::getPath(char buffer[2048], const char *filename) {
3092 UErrorCode status=U_ZERO_ERROR;
3093 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3094 if (U_FAILURE(status)) {
3095 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3096 return NULL;
3097 }
3098
3099 strcpy(buffer, testDataDirectory);
3100 strcat(buffer, filename);
3101 return buffer;
3102}
3103
3104void RegexTest::Extended() {
3105 char tdd[2048];
3106 const char *srcPath;
3107 UErrorCode status = U_ZERO_ERROR;
3108 int32_t lineNum = 0;
3109
3110 //
3111 // Open and read the test data file.
3112 //
3113 srcPath=getPath(tdd, "regextst.txt");
3114 if(srcPath==NULL) {
3115 return; /* something went wrong, error already output */
3116 }
3117
3118 int32_t len;
3119 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3120 if (U_FAILURE(status)) {
3121 return; /* something went wrong, error already output */
3122 }
3123
3124 //
3125 // Put the test data into a UnicodeString
3126 //
3127 UnicodeString testString(FALSE, testData, len);
3128
3129 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3130 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
4388f060 3131 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
729e4ab9
A
3132
3133 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3134 UnicodeString testPattern; // The pattern for test from the test file.
3135 UnicodeString testFlags; // the flags for a test.
3136 UnicodeString matchString; // The marked up string to be used as input
3137
3138 if (U_FAILURE(status)){
57a6839d 3139 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
729e4ab9
A
3140 delete [] testData;
3141 return;
3142 }
3143
3144 //
3145 // Loop over the test data file, once per line.
3146 //
3147 while (lineMat.find()) {
3148 lineNum++;
3149 if (U_FAILURE(status)) {
3150 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3151 }
3152
3153 status = U_ZERO_ERROR;
3154 UnicodeString testLine = lineMat.group(1, status);
3155 if (testLine.length() == 0) {
3156 continue;
3157 }
3158
3159 //
3160 // Parse the test line. Skip blank and comment only lines.
3161 // Separate out the three main fields - pattern, flags, target.
3162 //
3163
3164 commentMat.reset(testLine);
3165 if (commentMat.lookingAt(status)) {
3166 // This line is a comment, or blank.
3167 continue;
3168 }
3169
3170 //
3171 // Pull out the pattern field, remove it from the test file line.
3172 //
3173 quotedStuffMat.reset(testLine);
3174 if (quotedStuffMat.lookingAt(status)) {
3175 testPattern = quotedStuffMat.group(2, status);
3176 testLine.remove(0, quotedStuffMat.end(0, status));
3177 } else {
3178 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3179 continue;
3180 }
3181
3182
3183 //
3184 // Pull out the flags from the test file line.
3185 //
3186 flagsMat.reset(testLine);
3187 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3188 testFlags = flagsMat.group(1, status);
3189 if (flagsMat.group(2, status).length() > 0) {
3190 errln("Bad Match flag at line %d. Scanning %c\n",
b75a7d8f
A
3191 lineNum, flagsMat.group(2, status).charAt(0));
3192 continue;
3193 }
729e4ab9
A
3194 testLine.remove(0, flagsMat.end(0, status));
3195
3196 //
3197 // Pull out the match string, as a whole.
3198 // We'll process the <tags> later.
3199 //
3200 quotedStuffMat.reset(testLine);
3201 if (quotedStuffMat.lookingAt(status)) {
3202 matchString = quotedStuffMat.group(2, status);
3203 testLine.remove(0, quotedStuffMat.end(0, status));
3204 } else {
3205 errln("Bad match string at test file line %d", lineNum);
3206 continue;
3207 }
3208
3209 //
3210 // The only thing left from the input line should be an optional trailing comment.
3211 //
3212 commentMat.reset(testLine);
3213 if (commentMat.lookingAt(status) == FALSE) {
3214 errln("Line %d: unexpected characters at end of test line.", lineNum);
3215 continue;
3216 }
3217
3218 //
3219 // Run the test
3220 //
3221 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3222 }
3223
3224 delete [] testData;
3225
3226}
3227
3228
3229
3230//---------------------------------------------------------------------------
3231//
3232// regex_find(pattern, flags, inputString, lineNumber)
3233//
3234// Function to run a single test from the Extended (data driven) tests.
3235// See file test/testdata/regextst.txt for a description of the
3236// pattern and inputString fields, and the allowed flags.
3237// lineNumber is the source line in regextst.txt of the test.
3238//
3239//---------------------------------------------------------------------------
3240
3241
3242// Set a value into a UVector at position specified by a decimal number in
3243// a UnicodeString. This is a utility function needed by the actual test function,
3244// which follows.
3245static void set(UVector &vec, int32_t val, UnicodeString index) {
3246 UErrorCode status=U_ZERO_ERROR;
3247 int32_t idx = 0;
3248 for (int32_t i=0; i<index.length(); i++) {
3249 int32_t d=u_charDigitValue(index.charAt(i));
3250 if (d<0) {return;}
3251 idx = idx*10 + d;
3252 }
3253 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3254 vec.setElementAt(val, idx);
3255}
3256
3257static void setInt(UVector &vec, int32_t val, int32_t idx) {
3258 UErrorCode status=U_ZERO_ERROR;
3259 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3260 vec.setElementAt(val, idx);
3261}
3262
3263static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3264{
3265 UBool couldFind = TRUE;
3266 UTEXT_SETNATIVEINDEX(utext, 0);
3267 int32_t i = 0;
3268 while (i < unistrOffset) {
3269 UChar32 c = UTEXT_NEXT32(utext);
3270 if (c != U_SENTINEL) {
3271 i += U16_LENGTH(c);
3272 } else {
3273 couldFind = FALSE;
3274 break;
3275 }
3276 }
4388f060 3277 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
729e4ab9
A
3278 return couldFind;
3279}
3280
3281
3282void RegexTest::regex_find(const UnicodeString &pattern,
3283 const UnicodeString &flags,
3284 const UnicodeString &inputString,
3285 const char *srcPath,
3286 int32_t line) {
3287 UnicodeString unEscapedInput;
3288 UnicodeString deTaggedInput;
57a6839d 3289
729e4ab9
A
3290 int32_t patternUTF8Length, inputUTF8Length;
3291 char *patternChars = NULL, *inputChars = NULL;
3292 UText patternText = UTEXT_INITIALIZER;
3293 UText inputText = UTEXT_INITIALIZER;
3294 UConverter *UTF8Converter = NULL;
3295
3296 UErrorCode status = U_ZERO_ERROR;
3297 UParseError pe;
3298 RegexPattern *parsePat = NULL;
3299 RegexMatcher *parseMatcher = NULL;
3300 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3301 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3302 UVector groupStarts(status);
3303 UVector groupEnds(status);
3304 UVector groupStartsUTF8(status);
3305 UVector groupEndsUTF8(status);
3306 UBool isMatch = FALSE, isUTF8Match = FALSE;
3307 UBool failed = FALSE;
3308 int32_t numFinds;
3309 int32_t i;
3310 UBool useMatchesFunc = FALSE;
3311 UBool useLookingAtFunc = FALSE;
3312 int32_t regionStart = -1;
3313 int32_t regionEnd = -1;
3314 int32_t regionStartUTF8 = -1;
3315 int32_t regionEndUTF8 = -1;
57a6839d 3316
729e4ab9
A
3317
3318 //
3319 // Compile the caller's pattern
3320 //
3321 uint32_t bflags = 0;
3322 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3323 bflags |= UREGEX_CASE_INSENSITIVE;
3324 }
3325 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3326 bflags |= UREGEX_COMMENTS;
3327 }
3328 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3329 bflags |= UREGEX_DOTALL;
3330 }
3331 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3332 bflags |= UREGEX_MULTILINE;
3333 }
57a6839d 3334
729e4ab9
A
3335 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3336 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3337 }
3338 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3339 bflags |= UREGEX_UNIX_LINES;
3340 }
4388f060
A
3341 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3342 bflags |= UREGEX_LITERAL;
3343 }
729e4ab9
A
3344
3345
3346 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3347 if (status != U_ZERO_ERROR) {
3348 #if UCONFIG_NO_BREAK_ITERATION==1
3349 // 'v' test flag means that the test pattern should not compile if ICU was configured
3350 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3351 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3352 goto cleanupAndReturn;
3353 }
3354 #endif
3355 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3356 // Expected pattern compilation error.
3357 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3358 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3359 }
3360 goto cleanupAndReturn;
3361 } else {
3362 // Unexpected pattern compilation error.
4388f060 3363 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
729e4ab9
A
3364 goto cleanupAndReturn;
3365 }
3366 }
3367
3368 UTF8Converter = ucnv_open("UTF8", &status);
3369 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
57a6839d 3370
729e4ab9
A
3371 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3372 status = U_ZERO_ERROR; // buffer overflow
3373 patternChars = new char[patternUTF8Length+1];
3374 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3375 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
57a6839d 3376
729e4ab9
A
3377 if (status == U_ZERO_ERROR) {
3378 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
57a6839d 3379
729e4ab9
A
3380 if (status != U_ZERO_ERROR) {
3381#if UCONFIG_NO_BREAK_ITERATION==1
3382 // 'v' test flag means that the test pattern should not compile if ICU was configured
3383 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3384 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3385 goto cleanupAndReturn;
3386 }
3387#endif
3388 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3389 // Expected pattern compilation error.
3390 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3391 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3392 }
3393 goto cleanupAndReturn;
3394 } else {
3395 // Unexpected pattern compilation error.
3396 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3397 goto cleanupAndReturn;
3398 }
3399 }
3400 }
57a6839d 3401
729e4ab9
A
3402 if (UTF8Pattern == NULL) {
3403 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3404 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3405 status = U_ZERO_ERROR;
3406 }
3407
3408 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
57a6839d 3409 callerPattern->dumpPattern();
729e4ab9
A
3410 }
3411
3412 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3413 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3414 goto cleanupAndReturn;
3415 }
3416
3417
3418 //
3419 // Number of times find() should be called on the test string, default to 1
3420 //
3421 numFinds = 1;
3422 for (i=2; i<=9; i++) {
3423 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3424 if (numFinds != 1) {
3425 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3426 goto cleanupAndReturn;
3427 }
3428 numFinds = i;
3429 }
3430 }
57a6839d 3431
729e4ab9
A
3432 // 'M' flag. Use matches() instead of find()
3433 if (flags.indexOf((UChar)0x4d) >= 0) {
3434 useMatchesFunc = TRUE;
3435 }
3436 if (flags.indexOf((UChar)0x4c) >= 0) {
3437 useLookingAtFunc = TRUE;
3438 }
3439
3440 //
3441 // Find the tags in the input data, remove them, and record the group boundary
3442 // positions.
3443 //
3444 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3445 REGEX_CHECK_STATUS_L(line);
3446
3447 unEscapedInput = inputString.unescape();
3448 parseMatcher = parsePat->matcher(unEscapedInput, status);
3449 REGEX_CHECK_STATUS_L(line);
3450 while(parseMatcher->find()) {
3451 parseMatcher->appendReplacement(deTaggedInput, "", status);
3452 REGEX_CHECK_STATUS;
3453 UnicodeString groupNum = parseMatcher->group(2, status);
3454 if (groupNum == "r") {
3455 // <r> or </r>, a region specification within the string
3456 if (parseMatcher->group(1, status) == "/") {
3457 regionEnd = deTaggedInput.length();
3458 } else {
3459 regionStart = deTaggedInput.length();
3460 }
3461 } else {
3462 // <digits> or </digits>, a group match boundary tag.
3463 if (parseMatcher->group(1, status) == "/") {
3464 set(groupEnds, deTaggedInput.length(), groupNum);
3465 } else {
3466 set(groupStarts, deTaggedInput.length(), groupNum);
3467 }
3468 }
3469 }
3470 parseMatcher->appendTail(deTaggedInput);
3471 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3472 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3473 errln("mismatched <r> tags");
3474 failed = TRUE;
3475 goto cleanupAndReturn;
3476 }
b75a7d8f 3477
729e4ab9
A
3478 //
3479 // Configure the matcher according to the flags specified with this test.
3480 //
3481 matcher = callerPattern->matcher(deTaggedInput, status);
3482 REGEX_CHECK_STATUS_L(line);
3483 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3484 matcher->setTrace(TRUE);
3485 }
57a6839d 3486
729e4ab9
A
3487 if (UTF8Pattern != NULL) {
3488 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3489 status = U_ZERO_ERROR; // buffer overflow
3490 inputChars = new char[inputUTF8Length+1];
3491 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3492 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3493
3494 if (status == U_ZERO_ERROR) {
4388f060 3495 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
729e4ab9
A
3496 REGEX_CHECK_STATUS_L(line);
3497 }
57a6839d 3498
729e4ab9
A
3499 if (UTF8Matcher == NULL) {
3500 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3501 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3502 status = U_ZERO_ERROR;
3503 }
3504 }
3505
3506 //
3507 // Generate native indices for UTF8 versions of region and capture group info
3508 //
3509 if (UTF8Matcher != NULL) {
3510 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3511 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
57a6839d 3512
729e4ab9
A
3513 // Fill out the native index UVector info.
3514 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3515 for (i=0; i<groupStarts.size(); i++) {
3516 int32_t start = groupStarts.elementAti(i);
3517 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3518 if (start >= 0) {
3519 int32_t startUTF8;
3520 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3521 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3522 failed = TRUE;
3523 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3524 }
3525 setInt(groupStartsUTF8, startUTF8, i);
3526 }
57a6839d 3527
729e4ab9
A
3528 int32_t end = groupEnds.elementAti(i);
3529 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3530 if (end >= 0) {
3531 int32_t endUTF8;
3532 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3533 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3534 failed = TRUE;
3535 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3536 }
3537 setInt(groupEndsUTF8, endUTF8, i);
3538 }
3539 }
3540 }
3541
3542 if (regionStart>=0) {
3543 matcher->region(regionStart, regionEnd, status);
3544 REGEX_CHECK_STATUS_L(line);
3545 if (UTF8Matcher != NULL) {
3546 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3547 REGEX_CHECK_STATUS_L(line);
3548 }
3549 }
3550 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3551 matcher->useAnchoringBounds(FALSE);
3552 if (UTF8Matcher != NULL) {
3553 UTF8Matcher->useAnchoringBounds(FALSE);
3554 }
3555 }
3556 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3557 matcher->useTransparentBounds(TRUE);
3558 if (UTF8Matcher != NULL) {
3559 UTF8Matcher->useTransparentBounds(TRUE);
3560 }
3561 }
57a6839d
A
3562
3563
729e4ab9
A
3564
3565 //
3566 // Do a find on the de-tagged input using the caller's pattern
3567 // TODO: error on count>1 and not find().
3568 // error on both matches() and lookingAt().
3569 //
3570 for (i=0; i<numFinds; i++) {
3571 if (useMatchesFunc) {
3572 isMatch = matcher->matches(status);
3573 if (UTF8Matcher != NULL) {
3574 isUTF8Match = UTF8Matcher->matches(status);
3575 }
3576 } else if (useLookingAtFunc) {
3577 isMatch = matcher->lookingAt(status);
3578 if (UTF8Matcher != NULL) {
3579 isUTF8Match = UTF8Matcher->lookingAt(status);
3580 }
b75a7d8f 3581 } else {
729e4ab9
A
3582 isMatch = matcher->find();
3583 if (UTF8Matcher != NULL) {
3584 isUTF8Match = UTF8Matcher->find();
3585 }
b75a7d8f 3586 }
729e4ab9
A
3587 }
3588 matcher->setTrace(FALSE);
57a6839d
A
3589 if (U_FAILURE(status)) {
3590 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3591 }
b75a7d8f 3592
729e4ab9
A
3593 //
3594 // Match up the groups from the find() with the groups from the tags
3595 //
3596
3597 // number of tags should match number of groups from find operation.
3598 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3599 // G option in test means that capture group data is not available in the
3600 // expected results, so the check needs to be suppressed.
3601 if (isMatch == FALSE && groupStarts.size() != 0) {
4388f060 3602 dataerrln("Error at line %d: Match expected, but none found.", line);
729e4ab9
A
3603 failed = TRUE;
3604 goto cleanupAndReturn;
3605 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3606 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3607 failed = TRUE;
3608 goto cleanupAndReturn;
3609 }
3610
3611 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3612 // Only check for match / no match. Don't check capture groups.
3613 if (isMatch && groupStarts.size() == 0) {
3614 errln("Error at line %d: No match expected, but one found.", line);
3615 failed = TRUE;
3616 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3617 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3618 failed = TRUE;
3619 }
3620 goto cleanupAndReturn;
3621 }
3622
3623 REGEX_CHECK_STATUS_L(line);
3624 for (i=0; i<=matcher->groupCount(); i++) {
3625 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3626 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3627 if (matcher->start(i, status) != expectedStart) {
3628 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3629 line, i, expectedStart, matcher->start(i, status));
3630 failed = TRUE;
3631 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3632 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3633 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3634 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3635 failed = TRUE;
3636 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3637 }
57a6839d 3638
729e4ab9
A
3639 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3640 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3641 if (matcher->end(i, status) != expectedEnd) {
3642 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3643 line, i, expectedEnd, matcher->end(i, status));
3644 failed = TRUE;
3645 // Error on end position; keep going; real error is probably yet to come as group
3646 // end positions work from end of the input data towards the front.
3647 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3648 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3649 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3650 failed = TRUE;
3651 // Error on end position; keep going; real error is probably yet to come as group
3652 // end positions work from end of the input data towards the front.
3653 }
3654 }
3655 if ( matcher->groupCount()+1 < groupStarts.size()) {
3656 errln("Error at line %d: Expected %d capture groups, found %d.",
3657 line, groupStarts.size()-1, matcher->groupCount());
3658 failed = TRUE;
b75a7d8f 3659 }
729e4ab9
A
3660 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3661 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3662 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3663 failed = TRUE;
3664 }
3665
3666 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3667 matcher->requireEnd() == TRUE) {
3668 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3669 failed = TRUE;
3670 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3671 UTF8Matcher->requireEnd() == TRUE) {
3672 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3673 failed = TRUE;
3674 }
57a6839d 3675
729e4ab9
A
3676 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3677 matcher->requireEnd() == FALSE) {
3678 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3679 failed = TRUE;
3680 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3681 UTF8Matcher->requireEnd() == FALSE) {
3682 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3683 failed = TRUE;
3684 }
57a6839d 3685
729e4ab9
A
3686 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3687 matcher->hitEnd() == TRUE) {
3688 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3689 failed = TRUE;
3690 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3691 UTF8Matcher->hitEnd() == TRUE) {
3692 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3693 failed = TRUE;
3694 }
57a6839d 3695
729e4ab9
A
3696 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3697 matcher->hitEnd() == FALSE) {
3698 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3699 failed = TRUE;
3700 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3701 UTF8Matcher->hitEnd() == FALSE) {
3702 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3703 failed = TRUE;
3704 }
3705
3706
3707cleanupAndReturn:
3708 if (failed) {
3709 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3710 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3711 // callerPattern->dump();
3712 }
3713 delete parseMatcher;
3714 delete parsePat;
3715 delete UTF8Matcher;
3716 delete UTF8Pattern;
3717 delete matcher;
3718 delete callerPattern;
57a6839d 3719
729e4ab9
A
3720 utext_close(&inputText);
3721 delete[] inputChars;
3722 utext_close(&patternText);
3723 delete[] patternChars;
3724 ucnv_close(UTF8Converter);
3725}
3726
3727
3728
3729
3730//---------------------------------------------------------------------------
3731//
3732// Errors Check for error handling in patterns.
3733//
3734//---------------------------------------------------------------------------
3735void RegexTest::Errors() {
3736 // \escape sequences that aren't implemented yet.
3737 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3738
3739 // Missing close parentheses
3740 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3741 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3742 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3743
3744 // Extra close paren
3745 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3746 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3747 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3748
3749 // Look-ahead, Look-behind
3750 // TODO: add tests for unbounded length look-behinds.
3751 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3752
3753 // Attempt to use non-default flags
3754 {
3755 UParseError pe;
3756 UErrorCode status = U_ZERO_ERROR;
3757 int32_t flags = UREGEX_CANON_EQ |
3758 UREGEX_COMMENTS | UREGEX_DOTALL |
3759 UREGEX_MULTILINE;
3760 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3761 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3762 delete pat1;
3763 }
3764
3765
3766 // Quantifiers are allowed only after something that can be quantified.
3767 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3768 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3769 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3770
3771 // Mal-formed {min,max} quantifiers
3772 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3773 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3774 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3775 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3776 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3777 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3778 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3779 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3780 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
b75a7d8f 3781
729e4ab9
A
3782 // Ticket 5389
3783 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
b75a7d8f 3784
729e4ab9
A
3785 // Invalid Back Reference \0
3786 // For ICU 3.8 and earlier
3787 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3788 //
3789 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
b75a7d8f
A
3790
3791}
3792
3793
729e4ab9 3794//-------------------------------------------------------------------------------
57a6839d 3795//
729e4ab9
A
3796// Read a text data file, convert it to UChars, and return the data
3797// in one big UChar * buffer, which the caller must delete.
46f4442e 3798//
729e4ab9
A
3799//--------------------------------------------------------------------------------
3800UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3801 const char *defEncoding, UErrorCode &status) {
3802 UChar *retPtr = NULL;
3803 char *fileBuf = NULL;
3804 UConverter* conv = NULL;
3805 FILE *f = NULL;
46f4442e 3806
729e4ab9
A
3807 ulen = 0;
3808 if (U_FAILURE(status)) {
3809 return retPtr;
46f4442e 3810 }
46f4442e
A
3811
3812 //
729e4ab9 3813 // Open the file.
46f4442e 3814 //
729e4ab9
A
3815 f = fopen(fileName, "rb");
3816 if (f == 0) {
3817 dataerrln("Error opening test data file %s\n", fileName);
3818 status = U_FILE_ACCESS_ERROR;
3819 return NULL;
46f4442e 3820 }
729e4ab9
A
3821 //
3822 // Read it in
3823 //
3824 int32_t fileSize;
3825 int32_t amt_read;
3826
3827 fseek( f, 0, SEEK_END);
3828 fileSize = ftell(f);
3829 fileBuf = new char[fileSize];
3830 fseek(f, 0, SEEK_SET);
3831 amt_read = fread(fileBuf, 1, fileSize, f);
3832 if (amt_read != fileSize || fileSize <= 0) {
3833 errln("Error reading test data file.");
3834 goto cleanUpAndReturn;
46f4442e
A
3835 }
3836
729e4ab9
A
3837 //
3838 // Look for a Unicode Signature (BOM) on the data just read
3839 //
3840 int32_t signatureLength;
3841 const char * fileBufC;
3842 const char* encoding;
46f4442e 3843
729e4ab9
A
3844 fileBufC = fileBuf;
3845 encoding = ucnv_detectUnicodeSignature(
3846 fileBuf, fileSize, &signatureLength, &status);
3847 if(encoding!=NULL ){
3848 fileBufC += signatureLength;
3849 fileSize -= signatureLength;
3850 } else {
3851 encoding = defEncoding;
3852 if (strcmp(encoding, "utf-8") == 0) {
3853 errln("file %s is missing its BOM", fileName);
46f4442e
A
3854 }
3855 }
3856
729e4ab9
A
3857 //
3858 // Open a converter to take the rule file to UTF-16
3859 //
3860 conv = ucnv_open(encoding, &status);
3861 if (U_FAILURE(status)) {
3862 goto cleanUpAndReturn;
46f4442e
A
3863 }
3864
729e4ab9
A
3865 //
3866 // Convert the rules to UChar.
3867 // Preflight first to determine required buffer size.
3868 //
3869 ulen = ucnv_toUChars(conv,
3870 NULL, // dest,
3871 0, // destCapacity,
3872 fileBufC,
3873 fileSize,
3874 &status);
3875 if (status == U_BUFFER_OVERFLOW_ERROR) {
3876 // Buffer Overflow is expected from the preflight operation.
3877 status = U_ZERO_ERROR;
3878
3879 retPtr = new UChar[ulen+1];
3880 ucnv_toUChars(conv,
3881 retPtr, // dest,
3882 ulen+1,
3883 fileBufC,
3884 fileSize,
3885 &status);
46f4442e
A
3886 }
3887
729e4ab9
A
3888cleanUpAndReturn:
3889 fclose(f);
3890 delete[] fileBuf;
3891 ucnv_close(conv);
3892 if (U_FAILURE(status)) {
3893 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4388f060 3894 delete []retPtr;
729e4ab9
A
3895 retPtr = 0;
3896 ulen = 0;
3897 };
3898 return retPtr;
3899}
3900
3901
3902//-------------------------------------------------------------------------------
3903//
3904// PerlTests - Run Perl's regular expression tests
3905// The input file for this test is re_tests, the standard regular
3906// expression test data distributed with the Perl source code.
3907//
3908// Here is Perl's description of the test data file:
3909//
3910// # The tests are in a separate file 't/op/re_tests'.
3911// # Each line in that file is a separate test.
3912// # There are five columns, separated by tabs.
3913// #
3914// # Column 1 contains the pattern, optionally enclosed in C<''>.
3915// # Modifiers can be put after the closing C<'>.
3916// #
3917// # Column 2 contains the string to be matched.
3918// #
3919// # Column 3 contains the expected result:
3920// # y expect a match
3921// # n expect no match
3922// # c expect an error
3923// # B test exposes a known bug in Perl, should be skipped
3924// # b test exposes a known bug in Perl, should be skipped if noamp
3925// #
3926// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3927// #
3928// # Column 4 contains a string, usually C<$&>.
3929// #
3930// # Column 5 contains the expected result of double-quote
3931// # interpolating that string after the match, or start of error message.
3932// #
3933// # Column 6, if present, contains a reason why the test is skipped.
3934// # This is printed with "skipped", for harness to pick up.
3935// #
3936// # \n in the tests are interpolated, as are variables of the form ${\w+}.
3937// #
3938// # If you want to add a regular expression test that can't be expressed
3939// # in this format, don't add it here: put it in op/pat.t instead.
3940//
3941// For ICU, if field 3 contains an 'i', the test will be skipped.
//              The test exposes some known incompatibility between ICU and Perl regexps.
3943// (The i is in addition to whatever was there before.)
3944//
3945//-------------------------------------------------------------------------------
3946void RegexTest::PerlTests() {
3947 char tdd[2048];
3948 const char *srcPath;
3949 UErrorCode status = U_ZERO_ERROR;
3950 UParseError pe;
46f4442e
A
3951
3952 //
729e4ab9 3953 // Open and read the test data file.
46f4442e 3954 //
729e4ab9
A
3955 srcPath=getPath(tdd, "re_tests.txt");
3956 if(srcPath==NULL) {
3957 return; /* something went wrong, error already output */
46f4442e 3958 }
729e4ab9
A
3959
3960 int32_t len;
3961 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3962 if (U_FAILURE(status)) {
3963 return; /* something went wrong, error already output */
46f4442e
A
3964 }
3965
3966 //
729e4ab9 3967 // Put the test data into a UnicodeString
46f4442e 3968 //
729e4ab9 3969 UnicodeString testDataString(FALSE, testData, len);
46f4442e 3970
729e4ab9
A
3971 //
3972 // Regex to break the input file into lines, and strip the new lines.
3973 // One line per match, capture group one is the desired data.
3974 //
3975 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3976 if (U_FAILURE(status)) {
3977 dataerrln("RegexPattern::compile() error");
3978 return;
46f4442e 3979 }
729e4ab9 3980 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
46f4442e 3981
729e4ab9
A
3982 //
3983 // Regex to split a test file line into fields.
3984 // There are six fields, separated by tabs.
3985 //
3986 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
46f4442e
A
3987
3988 //
729e4ab9
A
3989 // Regex to identify test patterns with flag settings, and to separate them.
3990 // Test patterns with flags look like 'pattern'i
3991 // Test patterns without flags are not quoted: pattern
3992 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
46f4442e 3993 //
729e4ab9
A
3994 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3995 RegexMatcher* flagMat = flagPat->matcher(status);
46f4442e
A
3996
3997 //
729e4ab9
A
3998 // The Perl tests reference several perl-isms, which are evaluated/substituted
3999 // in the test data. Not being perl, this must be done explicitly. Here
4000 // are string constants and REs for these constructs.
46f4442e 4001 //
729e4ab9
A
4002 UnicodeString nulnulSrc("${nulnul}");
4003 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4004 nulnul = nulnul.unescape();
4005
4006 UnicodeString ffffSrc("${ffff}");
4007 UnicodeString ffff("\\uffff", -1, US_INV);
4008 ffff = ffff.unescape();
4009
4010 // regexp for $-[0], $+[2], etc.
4011 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4012 RegexMatcher *groupsMat = groupsPat->matcher(status);
4013
4014 // regexp for $0, $1, $2, etc.
4015 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4016 RegexMatcher *cgMat = cgPat->matcher(status);
4017
46f4442e
A
4018
4019 //
729e4ab9
A
4020 // Main Loop for the Perl Tests, runs once per line from the
4021 // test data file.
46f4442e 4022 //
729e4ab9
A
4023 int32_t lineNum = 0;
4024 int32_t skippedUnimplementedCount = 0;
4025 while (lineMat->find()) {
4026 lineNum++;
46f4442e 4027
729e4ab9
A
4028 //
4029 // Get a line, break it into its fields, do the Perl
4030 // variable substitutions.
4031 //
4032 UnicodeString line = lineMat->group(1, status);
4033 UnicodeString fields[7];
4034 fieldPat->split(line, fields, 7, status);
46f4442e 4035
729e4ab9
A
4036 flagMat->reset(fields[0]);
4037 flagMat->matches(status);
4038 UnicodeString pattern = flagMat->group(2, status);
4039 pattern.findAndReplace("${bang}", "!");
4040 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4041 pattern.findAndReplace(ffffSrc, ffff);
4042
4043 //
4044 // Identify patterns that include match flag settings,
4045 // split off the flags, remove the extra quotes.
4046 //
4047 UnicodeString flagStr = flagMat->group(3, status);
4048 if (U_FAILURE(status)) {
4049 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4050 return;
4051 }
4052 int32_t flags = 0;
4053 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4054 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4055 const UChar UChar_m = 0x6d;
4056 const UChar UChar_x = 0x78;
4057 const UChar UChar_y = 0x79;
4058 if (flagStr.indexOf(UChar_i) != -1) {
4059 flags |= UREGEX_CASE_INSENSITIVE;
4060 }
4061 if (flagStr.indexOf(UChar_m) != -1) {
4062 flags |= UREGEX_MULTILINE;
4063 }
4064 if (flagStr.indexOf(UChar_x) != -1) {
4065 flags |= UREGEX_COMMENTS;
46f4442e 4066 }
46f4442e 4067
729e4ab9
A
4068 //
4069 // Compile the test pattern.
4070 //
4071 status = U_ZERO_ERROR;
4072 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4073 if (status == U_REGEX_UNIMPLEMENTED) {
4074 //
4075 // Test of a feature that is planned for ICU, but not yet implemented.
4076 // skip the test.
4077 skippedUnimplementedCount++;
4078 delete testPat;
4079 status = U_ZERO_ERROR;
4080 continue;
46f4442e 4081 }
729e4ab9
A
4082
4083 if (U_FAILURE(status)) {
4084 // Some tests are supposed to generate errors.
4085 // Only report an error for tests that are supposed to succeed.
4086 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4087 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4088 {
4089 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4090 }
4091 status = U_ZERO_ERROR;
4092 delete testPat;
4093 continue;
46f4442e 4094 }
729e4ab9
A
4095
4096 if (fields[2].indexOf(UChar_i) >= 0) {
4097 // ICU should skip this test.
4098 delete testPat;
4099 continue;
46f4442e
A
4100 }
4101
729e4ab9
A
4102 if (fields[2].indexOf(UChar_c) >= 0) {
4103 // This pattern should have caused a compilation error, but didn't/
4104 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4105 delete testPat;
4106 continue;
4107 }
4108
4109 //
4110 // replace the Perl variables that appear in some of the
4111 // match data strings.
4112 //
4113 UnicodeString matchString = fields[1];
4114 matchString.findAndReplace(nulnulSrc, nulnul);
4115 matchString.findAndReplace(ffffSrc, ffff);
46f4442e 4116
729e4ab9
A
4117 // Replace any \n in the match string with an actual new-line char.
4118 // Don't do full unescape, as this unescapes more than Perl does, which
4119 // causes other spurious failures in the tests.
4120 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
46f4442e 4121
46f4442e
A
4122
4123
729e4ab9
A
4124 //
4125 // Run the test, check for expected match/don't match result.
4126 //
4127 RegexMatcher *testMat = testPat->matcher(matchString, status);
4128 UBool found = testMat->find();
4129 UBool expected = FALSE;
4130 if (fields[2].indexOf(UChar_y) >=0) {
4131 expected = TRUE;
4132 }
4133 if (expected != found) {
4134 errln("line %d: Expected %smatch, got %smatch",
4135 lineNum, expected?"":"no ", found?"":"no " );
4136 continue;
4137 }
57a6839d 4138
729e4ab9
A
4139 // Don't try to check expected results if there is no match.
4140 // (Some have stuff in the expected fields)
4141 if (!found) {
4142 delete testMat;
4143 delete testPat;
4144 continue;
4145 }
46f4442e 4146
729e4ab9
A
4147 //
4148 // Interpret the Perl expression from the fourth field of the data file,
4149 // building up an ICU string from the results of the ICU match.
4150 // The Perl expression will contain references to the results of
4151 // a regex match, including the matched string, capture group strings,
4152 // group starting and ending indicies, etc.
4153 //
4154 UnicodeString resultString;
4155 UnicodeString perlExpr = fields[3];
4156#if SUPPORT_MUTATING_INPUT_STRING
4157 groupsMat->reset(perlExpr);
4158 cgMat->reset(perlExpr);
4159#endif
46f4442e 4160
729e4ab9
A
4161 while (perlExpr.length() > 0) {
4162#if !SUPPORT_MUTATING_INPUT_STRING
4163 // Perferred usage. Reset after any modification to input string.
4164 groupsMat->reset(perlExpr);
4165 cgMat->reset(perlExpr);
4166#endif
b75a7d8f 4167
729e4ab9
A
4168 if (perlExpr.startsWith("$&")) {
4169 resultString.append(testMat->group(status));
4170 perlExpr.remove(0, 2);
4171 }
b75a7d8f 4172
729e4ab9
A
4173 else if (groupsMat->lookingAt(status)) {
4174 // $-[0] $+[2] etc.
4175 UnicodeString digitString = groupsMat->group(2, status);
4176 int32_t t = 0;
4177 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4178 UnicodeString plusOrMinus = groupsMat->group(1, status);
4179 int32_t matchPosition;
4180 if (plusOrMinus.compare("+") == 0) {
4181 matchPosition = testMat->end(groupNum, status);
4182 } else {
4183 matchPosition = testMat->start(groupNum, status);
4184 }
4185 if (matchPosition != -1) {
4186 ICU_Utility::appendNumber(resultString, matchPosition);
4187 }
4188 perlExpr.remove(0, groupsMat->end(status));
4189 }
b75a7d8f 4190
729e4ab9
A
4191 else if (cgMat->lookingAt(status)) {
4192 // $1, $2, $3, etc.
4193 UnicodeString digitString = cgMat->group(1, status);
4194 int32_t t = 0;
4195 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4196 if (U_SUCCESS(status)) {
4197 resultString.append(testMat->group(groupNum, status));
4198 status = U_ZERO_ERROR;
4199 }
4200 perlExpr.remove(0, cgMat->end(status));
4201 }
b75a7d8f 4202
729e4ab9
A
4203 else if (perlExpr.startsWith("@-")) {
4204 int32_t i;
4205 for (i=0; i<=testMat->groupCount(); i++) {
4206 if (i>0) {
4207 resultString.append(" ");
4208 }
4209 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4210 }
4211 perlExpr.remove(0, 2);
4212 }
b75a7d8f 4213
729e4ab9
A
4214 else if (perlExpr.startsWith("@+")) {
4215 int32_t i;
4216 for (i=0; i<=testMat->groupCount(); i++) {
4217 if (i>0) {
4218 resultString.append(" ");
4219 }
4220 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4221 }
4222 perlExpr.remove(0, 2);
4223 }
b75a7d8f 4224
729e4ab9
A
4225 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4226 // or as an escaped sequence (e.g. \n)
4227 if (perlExpr.length() > 1) {
4228 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4229 }
4230 UChar c = perlExpr.charAt(0);
4231 switch (c) {
4232 case 'n': c = '\n'; break;
4233 // add any other escape sequences that show up in the test expected results.
4234 }
4235 resultString.append(c);
4236 perlExpr.remove(0, 1);
4237 }
b75a7d8f 4238
729e4ab9
A
4239 else {
4240 // Any characters from the perl expression that we don't explicitly
4241 // recognize before here are assumed to be literals and copied
4242 // as-is to the expected results.
4243 resultString.append(perlExpr.charAt(0));
4244 perlExpr.remove(0, 1);
4245 }
374ca955 4246
729e4ab9
A
4247 if (U_FAILURE(status)) {
4248 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4249 break;
4250 }
4251 }
b75a7d8f 4252
729e4ab9
A
4253 //
4254 // Expected Results Compare
4255 //
4256 UnicodeString expectedS(fields[4]);
4257 expectedS.findAndReplace(nulnulSrc, nulnul);
4258 expectedS.findAndReplace(ffffSrc, ffff);
4259 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
4260
4261
729e4ab9
A
4262 if (expectedS.compare(resultString) != 0) {
4263 err("Line %d: Incorrect perl expression results.", lineNum);
4264 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4265 }
374ca955 4266
729e4ab9
A
4267 delete testMat;
4268 delete testPat;
b75a7d8f 4269 }
374ca955 4270
b75a7d8f 4271 //
729e4ab9 4272 // All done. Clean up allocated stuff.
b75a7d8f 4273 //
729e4ab9
A
4274 delete cgMat;
4275 delete cgPat;
374ca955 4276
729e4ab9
A
4277 delete groupsMat;
4278 delete groupsPat;
374ca955 4279
729e4ab9
A
4280 delete flagMat;
4281 delete flagPat;
374ca955 4282
729e4ab9
A
4283 delete lineMat;
4284 delete linePat;
374ca955 4285
729e4ab9
A
4286 delete fieldPat;
4287 delete [] testData;
374ca955 4288
374ca955 4289
729e4ab9 4290 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
b75a7d8f 4291
b75a7d8f
A
4292}
4293
4294
4295//-------------------------------------------------------------------------------
4296//
729e4ab9
A
4297// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4298// (instead of using UnicodeStrings) to test the alternate engine.
4299// The input file for this test is re_tests, the standard regular
4300// expression test data distributed with the Perl source code.
4301// See PerlTests() for more information.
b75a7d8f
A
4302//
4303//-------------------------------------------------------------------------------
729e4ab9 4304void RegexTest::PerlTestsUTF8() {
374ca955
A
4305 char tdd[2048];
4306 const char *srcPath;
b75a7d8f
A
4307 UErrorCode status = U_ZERO_ERROR;
4308 UParseError pe;
729e4ab9
A
4309 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4310 UText patternText = UTEXT_INITIALIZER;
4311 char *patternChars = NULL;
4312 int32_t patternLength;
4313 int32_t patternCapacity = 0;
4314 UText inputText = UTEXT_INITIALIZER;
4315 char *inputChars = NULL;
4316 int32_t inputLength;
4317 int32_t inputCapacity = 0;
4318
4319 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
b75a7d8f
A
4320
4321 //
4322 // Open and read the test data file.
4323 //
374ca955
A
4324 srcPath=getPath(tdd, "re_tests.txt");
4325 if(srcPath==NULL) {
4326 return; /* something went wrong, error already output */
b75a7d8f
A
4327 }
4328
46f4442e
A
4329 int32_t len;
4330 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
374ca955
A
4331 if (U_FAILURE(status)) {
4332 return; /* something went wrong, error already output */
4333 }
b75a7d8f
A
4334
4335 //
4336 // Put the test data into a UnicodeString
4337 //
4338 UnicodeString testDataString(FALSE, testData, len);
4339
4340 //
4341 // Regex to break the input file into lines, and strip the new lines.
4342 // One line per match, capture group one is the desired data.
4343 //
46f4442e 4344 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
73c04bcf
A
4345 if (U_FAILURE(status)) {
4346 dataerrln("RegexPattern::compile() error");
4347 return;
4348 }
b75a7d8f
A
4349 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4350
4351 //
4352 // Regex to split a test file line into fields.
4353 // There are six fields, separated by tabs.
4354 //
46f4442e 4355 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
b75a7d8f
A
4356
4357 //
4358 // Regex to identify test patterns with flag settings, and to separate them.
4359 // Test patterns with flags look like 'pattern'i
4360 // Test patterns without flags are not quoted: pattern
4361 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4362 //
46f4442e 4363 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
374ca955 4364 RegexMatcher* flagMat = flagPat->matcher(status);
b75a7d8f
A
4365
4366 //
4367 // The Perl tests reference several perl-isms, which are evaluated/substituted
4368 // in the test data. Not being perl, this must be done explicitly. Here
4369 // are string constants and REs for these constructs.
4370 //
4371 UnicodeString nulnulSrc("${nulnul}");
46f4442e 4372 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
b75a7d8f
A
4373 nulnul = nulnul.unescape();
4374
4375 UnicodeString ffffSrc("${ffff}");
46f4442e 4376 UnicodeString ffff("\\uffff", -1, US_INV);
b75a7d8f
A
4377 ffff = ffff.unescape();
4378
4379 // regexp for $-[0], $+[2], etc.
46f4442e 4380 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
374ca955
A
4381 RegexMatcher *groupsMat = groupsPat->matcher(status);
4382
b75a7d8f 4383 // regexp for $0, $1, $2, etc.
46f4442e 4384 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
374ca955 4385 RegexMatcher *cgMat = cgPat->matcher(status);
b75a7d8f
A
4386
4387
4388 //
4389 // Main Loop for the Perl Tests, runs once per line from the
4390 // test data file.
4391 //
4392 int32_t lineNum = 0;
4393 int32_t skippedUnimplementedCount = 0;
4394 while (lineMat->find()) {
4395 lineNum++;
4396
4397 //
4398 // Get a line, break it into its fields, do the Perl
4399 // variable substitutions.
4400 //
4401 UnicodeString line = lineMat->group(1, status);
4402 UnicodeString fields[7];
4403 fieldPat->split(line, fields, 7, status);
4404
4405 flagMat->reset(fields[0]);
4406 flagMat->matches(status);
4407 UnicodeString pattern = flagMat->group(2, status);
4408 pattern.findAndReplace("${bang}", "!");
46f4442e 4409 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
b75a7d8f
A
4410 pattern.findAndReplace(ffffSrc, ffff);
4411
4412 //
4413 // Identify patterns that include match flag settings,
4414 // split off the flags, remove the extra quotes.
4415 //
4416 UnicodeString flagStr = flagMat->group(3, status);
4417 if (U_FAILURE(status)) {
4418 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4419 return;
4420 }
4421 int32_t flags = 0;
4422 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4423 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4424 const UChar UChar_m = 0x6d;
4425 const UChar UChar_x = 0x78;
4426 const UChar UChar_y = 0x79;
4427 if (flagStr.indexOf(UChar_i) != -1) {
4428 flags |= UREGEX_CASE_INSENSITIVE;
4429 }
4430 if (flagStr.indexOf(UChar_m) != -1) {
4431 flags |= UREGEX_MULTILINE;
4432 }
4433 if (flagStr.indexOf(UChar_x) != -1) {
4434 flags |= UREGEX_COMMENTS;
4435 }
57a6839d 4436
729e4ab9
A
4437 //
4438 // Put the pattern in a UTF-8 UText
4439 //
4440 status = U_ZERO_ERROR;
4441 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4442 if (status == U_BUFFER_OVERFLOW_ERROR) {
4443 status = U_ZERO_ERROR;
4444 delete[] patternChars;
4445 patternCapacity = patternLength + 1;
4446 patternChars = new char[patternCapacity];
4447 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4448 }
4449 utext_openUTF8(&patternText, patternChars, patternLength, &status);
b75a7d8f
A
4450
4451 //
4452 // Compile the test pattern.
4453 //
729e4ab9 4454 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
b75a7d8f
A
4455 if (status == U_REGEX_UNIMPLEMENTED) {
4456 //
4457 // Test of a feature that is planned for ICU, but not yet implemented.
4458 // skip the test.
4459 skippedUnimplementedCount++;
4460 delete testPat;
4461 status = U_ZERO_ERROR;
4462 continue;
4463 }
4464
4465 if (U_FAILURE(status)) {
4466 // Some tests are supposed to generate errors.
4467 // Only report an error for tests that are supposed to succeed.
4468 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4469 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4470 {
4471 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4472 }
4473 status = U_ZERO_ERROR;
4474 delete testPat;
4475 continue;
4476 }
4477
4478 if (fields[2].indexOf(UChar_i) >= 0) {
4479 // ICU should skip this test.
4480 delete testPat;
4481 continue;
4482 }
4483
4484 if (fields[2].indexOf(UChar_c) >= 0) {
4485 // This pattern should have caused a compilation error, but didn't/
4486 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4487 delete testPat;
4488 continue;
4489 }
4490
729e4ab9 4491
b75a7d8f
A
4492 //
4493 // replace the Perl variables that appear in some of the
374ca955 4494 // match data strings.
b75a7d8f
A
4495 //
4496 UnicodeString matchString = fields[1];
4497 matchString.findAndReplace(nulnulSrc, nulnul);
4498 matchString.findAndReplace(ffffSrc, ffff);
4499
4500 // Replace any \n in the match string with an actual new-line char.
4501 // Don't do full unescape, as this unescapes more than Perl does, which
4502 // causes other spurious failures in the tests.
46f4442e 4503 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
374ca955 4504
729e4ab9
A
4505 //
4506 // Put the input in a UTF-8 UText
4507 //
4508 status = U_ZERO_ERROR;
4509 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4510 if (status == U_BUFFER_OVERFLOW_ERROR) {
4511 status = U_ZERO_ERROR;
4512 delete[] inputChars;
4513 inputCapacity = inputLength + 1;
4514 inputChars = new char[inputCapacity];
4515 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4516 }
4517 utext_openUTF8(&inputText, inputChars, inputLength, &status);
b75a7d8f
A
4518
4519 //
4520 // Run the test, check for expected match/don't match result.
4521 //
4388f060 4522 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
b75a7d8f
A
4523 UBool found = testMat->find();
4524 UBool expected = FALSE;
4525 if (fields[2].indexOf(UChar_y) >=0) {
4526 expected = TRUE;
4527 }
4528 if (expected != found) {
374ca955 4529 errln("line %d: Expected %smatch, got %smatch",
b75a7d8f
A
4530 lineNum, expected?"":"no ", found?"":"no " );
4531 continue;
4532 }
57a6839d 4533
46f4442e
A
4534 // Don't try to check expected results if there is no match.
4535 // (Some have stuff in the expected fields)
4536 if (!found) {
4537 delete testMat;
4538 delete testPat;
4539 continue;
4540 }
b75a7d8f
A
4541
4542 //
4543 // Interpret the Perl expression from the fourth field of the data file,
4544 // building up an ICU string from the results of the ICU match.
374ca955 4545 // The Perl expression will contain references to the results of
b75a7d8f
A
4546 // a regex match, including the matched string, capture group strings,
4547 // group starting and ending indicies, etc.
4548 //
4549 UnicodeString resultString;
4550 UnicodeString perlExpr = fields[3];
b75a7d8f
A
4551
4552 while (perlExpr.length() > 0) {
729e4ab9
A
4553 groupsMat->reset(perlExpr);
4554 cgMat->reset(perlExpr);
4555
b75a7d8f
A
4556 if (perlExpr.startsWith("$&")) {
4557 resultString.append(testMat->group(status));
4558 perlExpr.remove(0, 2);
4559 }
4560
4561 else if (groupsMat->lookingAt(status)) {
4562 // $-[0] $+[2] etc.
4563 UnicodeString digitString = groupsMat->group(2, status);
4564 int32_t t = 0;
4565 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4566 UnicodeString plusOrMinus = groupsMat->group(1, status);
4567 int32_t matchPosition;
4568 if (plusOrMinus.compare("+") == 0) {
4569 matchPosition = testMat->end(groupNum, status);
4570 } else {
4571 matchPosition = testMat->start(groupNum, status);
4572 }
4573 if (matchPosition != -1) {
4574 ICU_Utility::appendNumber(resultString, matchPosition);
4575 }
4576 perlExpr.remove(0, groupsMat->end(status));
4577 }
4578
4579 else if (cgMat->lookingAt(status)) {
4580 // $1, $2, $3, etc.
4581 UnicodeString digitString = cgMat->group(1, status);
4582 int32_t t = 0;
4583 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4584 if (U_SUCCESS(status)) {
4585 resultString.append(testMat->group(groupNum, status));
4586 status = U_ZERO_ERROR;
4587 }
4588 perlExpr.remove(0, cgMat->end(status));
4589 }
4590
4591 else if (perlExpr.startsWith("@-")) {
46f4442e 4592 int32_t i;
b75a7d8f
A
4593 for (i=0; i<=testMat->groupCount(); i++) {
4594 if (i>0) {
4595 resultString.append(" ");
4596 }
4597 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4598 }
4599 perlExpr.remove(0, 2);
4600 }
4601
4602 else if (perlExpr.startsWith("@+")) {
46f4442e 4603 int32_t i;
b75a7d8f
A
4604 for (i=0; i<=testMat->groupCount(); i++) {
4605 if (i>0) {
4606 resultString.append(" ");
4607 }
4608 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4609 }
4610 perlExpr.remove(0, 2);
4611 }
4612
46f4442e 4613 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
b75a7d8f
A
4614 // or as an escaped sequence (e.g. \n)
4615 if (perlExpr.length() > 1) {
4616 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4617 }
4618 UChar c = perlExpr.charAt(0);
4619 switch (c) {
4620 case 'n': c = '\n'; break;
4621 // add any other escape sequences that show up in the test expected results.
4622 }
374ca955 4623 resultString.append(c);
b75a7d8f
A
4624 perlExpr.remove(0, 1);
4625 }
4626
4627 else {
4628 // Any characters from the perl expression that we don't explicitly
4629 // recognize before here are assumed to be literals and copied
4630 // as-is to the expected results.
4631 resultString.append(perlExpr.charAt(0));
4632 perlExpr.remove(0, 1);
4633 }
4634
4635 if (U_FAILURE(status)) {
4636 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4637 break;
4638 }
4639 }
374ca955 4640
b75a7d8f
A
4641 //
4642 // Expected Results Compare
4643 //
4644 UnicodeString expectedS(fields[4]);
4645 expectedS.findAndReplace(nulnulSrc, nulnul);
4646 expectedS.findAndReplace(ffffSrc, ffff);
46f4442e 4647 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
4648
4649
4650 if (expectedS.compare(resultString) != 0) {
73c04bcf 4651 err("Line %d: Incorrect perl expression results.", lineNum);
729e4ab9 4652 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
b75a7d8f
A
4653 }
4654
4655 delete testMat;
4656 delete testPat;
4657 }
4658
4659 //
4660 // All done. Clean up allocated stuff.
4661 //
4662 delete cgMat;
4663 delete cgPat;
374ca955 4664
b75a7d8f
A
4665 delete groupsMat;
4666 delete groupsPat;
374ca955 4667
b75a7d8f
A
4668 delete flagMat;
4669 delete flagPat;
4670
4671 delete lineMat;
4672 delete linePat;
374ca955 4673
b75a7d8f
A
4674 delete fieldPat;
4675 delete [] testData;
57a6839d 4676
729e4ab9
A
4677 utext_close(&patternText);
4678 utext_close(&inputText);
57a6839d 4679
729e4ab9
A
4680 delete [] patternChars;
4681 delete [] inputChars;
374ca955 4682
b75a7d8f
A
4683
4684 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4685
4686}
4687
4688
729e4ab9
A
4689//--------------------------------------------------------------
4690//
4691// Bug6149 Verify limits to heap expansion for backtrack stack.
4692// Use this pattern,
57a6839d
A
4693// "(a?){1,8000000}"
4694// Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4695// This test is likely to be fragile, as further optimizations stop
4696// more cases of pointless looping in the match engine.
729e4ab9
A
4697//
4698//---------------------------------------------------------------
4699void RegexTest::Bug6149() {
57a6839d 4700 UnicodeString pattern("(a?){1,8000000}");
729e4ab9
A
4701 UnicodeString s("xyz");
4702 uint32_t flags = 0;
4703 UErrorCode status = U_ZERO_ERROR;
4704
4705 RegexMatcher matcher(pattern, s, flags, status);
4706 UBool result = false;
4707 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4708 REGEX_ASSERT(result == FALSE);
4709 }
4710
4711
46f4442e
A
4712//
4713// Callbacks() Test the callback function.
4714// When set, callbacks occur periodically during matching operations,
4715// giving the application code the ability to abort the operation
4716// before its normal completion.
4717//
4718
4719struct callBackContext {
4720 RegexTest *test;
4721 int32_t maxCalls;
4722 int32_t numCalls;
4723 int32_t lastSteps;
4724 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4725};
4726
4727U_CDECL_BEGIN
4728static UBool U_CALLCONV
4729testCallBackFn(const void *context, int32_t steps) {
4730 callBackContext *info = (callBackContext *)context;
4731 if (info->lastSteps+1 != steps) {
4732 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4733 }
4734 info->lastSteps = steps;
4735 info->numCalls++;
4736 return (info->numCalls < info->maxCalls);
4737}
4738U_CDECL_END
4739
4740void RegexTest::Callbacks() {
4741 {
4742 // Getter returns NULLs if no callback has been set
57a6839d 4743
46f4442e
A
4744 // The variables that the getter will fill in.
4745 // Init to non-null values so that the action of the getter can be seen.
4746 const void *returnedContext = &returnedContext;
4747 URegexMatchCallback *returnedFn = &testCallBackFn;
57a6839d 4748
46f4442e
A
4749 UErrorCode status = U_ZERO_ERROR;
4750 RegexMatcher matcher("x", 0, status);
4751 REGEX_CHECK_STATUS;
4752 matcher.getMatchCallback(returnedFn, returnedContext, status);
4753 REGEX_CHECK_STATUS;
4754 REGEX_ASSERT(returnedFn == NULL);
4755 REGEX_ASSERT(returnedContext == NULL);
4756 }
57a6839d 4757
46f4442e
A
4758 {
4759 // Set and Get work
4760 callBackContext cbInfo = {this, 0, 0, 0};
4761 const void *returnedContext;
4762 URegexMatchCallback *returnedFn;
4763 UErrorCode status = U_ZERO_ERROR;
4764 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4765 REGEX_CHECK_STATUS;
4766 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4767 REGEX_CHECK_STATUS;
4768 matcher.getMatchCallback(returnedFn, returnedContext, status);
4769 REGEX_CHECK_STATUS;
4770 REGEX_ASSERT(returnedFn == testCallBackFn);
4771 REGEX_ASSERT(returnedContext == &cbInfo);
57a6839d 4772
46f4442e
A
4773 // A short-running match shouldn't invoke the callback
4774 status = U_ZERO_ERROR;
4775 cbInfo.reset(1);
4776 UnicodeString s = "xxx";
4777 matcher.reset(s);
4778 REGEX_ASSERT(matcher.matches(status));
4779 REGEX_CHECK_STATUS;
4780 REGEX_ASSERT(cbInfo.numCalls == 0);
57a6839d 4781
46f4442e
A
4782 // A medium-length match that runs long enough to invoke the
4783 // callback, but not so long that the callback aborts it.
4784 status = U_ZERO_ERROR;
4785 cbInfo.reset(4);
4786 s = "aaaaaaaaaaaaaaaaaaab";
4787 matcher.reset(s);
4788 REGEX_ASSERT(matcher.matches(status)==FALSE);
4789 REGEX_CHECK_STATUS;
4790 REGEX_ASSERT(cbInfo.numCalls > 0);
57a6839d 4791
46f4442e
A
4792 // A longer running match that the callback function will abort.
4793 status = U_ZERO_ERROR;
4794 cbInfo.reset(4);
4795 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4796 matcher.reset(s);
4797 REGEX_ASSERT(matcher.matches(status)==FALSE);
4798 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4799 REGEX_ASSERT(cbInfo.numCalls == 4);
4800 }
57a6839d 4801
46f4442e
A
4802
4803}
b75a7d8f 4804
729e4ab9
A
4805
4806//
4807// FindProgressCallbacks() Test the find "progress" callback function.
4808// When set, the find progress callback will be invoked during find operations
4809// after each return from a match attempt, giving the application the opportunity
4810// to terminate a long-running find operation before its normal completion.
4811//
4812
4813struct progressCallBackContext {
4814 RegexTest *test;
4815 int64_t lastIndex;
4816 int32_t maxCalls;
4817 int32_t numCalls;
4818 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4819};
4820
4821U_CDECL_BEGIN
4822static UBool U_CALLCONV
4823testProgressCallBackFn(const void *context, int64_t matchIndex) {
4824 progressCallBackContext *info = (progressCallBackContext *)context;
4825 info->numCalls++;
4826 info->lastIndex = matchIndex;
4827// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4828 return (info->numCalls < info->maxCalls);
4829}
4830U_CDECL_END
4831
4832void RegexTest::FindProgressCallbacks() {
4833 {
4834 // Getter returns NULLs if no callback has been set
57a6839d 4835
729e4ab9
A
4836 // The variables that the getter will fill in.
4837 // Init to non-null values so that the action of the getter can be seen.
4838 const void *returnedContext = &returnedContext;
4839 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
57a6839d 4840
729e4ab9
A
4841 UErrorCode status = U_ZERO_ERROR;
4842 RegexMatcher matcher("x", 0, status);
4843 REGEX_CHECK_STATUS;
4844 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4845 REGEX_CHECK_STATUS;
4846 REGEX_ASSERT(returnedFn == NULL);
4847 REGEX_ASSERT(returnedContext == NULL);
4848 }
57a6839d 4849
729e4ab9
A
4850 {
4851 // Set and Get work
4852 progressCallBackContext cbInfo = {this, 0, 0, 0};
4853 const void *returnedContext;
4854 URegexFindProgressCallback *returnedFn;
4855 UErrorCode status = U_ZERO_ERROR;
4856 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4857 REGEX_CHECK_STATUS;
4858 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4859 REGEX_CHECK_STATUS;
4860 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4861 REGEX_CHECK_STATUS;
4862 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4863 REGEX_ASSERT(returnedContext == &cbInfo);
57a6839d 4864
729e4ab9
A
4865 // A short-running match should NOT invoke the callback.
4866 status = U_ZERO_ERROR;
4867 cbInfo.reset(100);
4868 UnicodeString s = "abxxx";
4869 matcher.reset(s);
4870#if 0
4871 matcher.setTrace(TRUE);
4872#endif
4873 REGEX_ASSERT(matcher.find(0, status));
4874 REGEX_CHECK_STATUS;
4875 REGEX_ASSERT(cbInfo.numCalls == 0);
57a6839d 4876
729e4ab9
A
4877 // A medium running match that causes matcher.find() to invoke our callback for each index.
4878 status = U_ZERO_ERROR;
4879 s = "aaaaaaaaaaaaaaaaaaab";
4880 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4881 matcher.reset(s);
4882 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4883 REGEX_CHECK_STATUS;
4884 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
57a6839d 4885
729e4ab9
A
4886 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4887 status = U_ZERO_ERROR;
4888 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4889 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4890 matcher.reset(s1);
4891 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4892 REGEX_CHECK_STATUS;
4893 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4894
4895#if 0
4896 // Now a match that will succeed, but after an interruption
4897 status = U_ZERO_ERROR;
4898 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4899 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4900 matcher.reset(s2);
4901 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4902 REGEX_CHECK_STATUS;
4903 // Now retry the match from where left off
4904 cbInfo.maxCalls = 100; // No callback limit
4905 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4906 REGEX_CHECK_STATUS;
4907#endif
4908 }
57a6839d 4909
729e4ab9
A
4910
4911}
4912
4913
4914//---------------------------------------------------------------------------
4915//
4916// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4917// UTexts. The pure-C implementation of UText
4918// has no mutable backing stores, but we can
4919// use UnicodeString here to test the functionality.
4920//
4921//---------------------------------------------------------------------------
4922void RegexTest::PreAllocatedUTextCAPI () {
4923 UErrorCode status = U_ZERO_ERROR;
4924 URegularExpression *re;
4925 UText patternText = UTEXT_INITIALIZER;
4926 UnicodeString buffer;
4927 UText bufferText = UTEXT_INITIALIZER;
57a6839d 4928
729e4ab9
A
4929 utext_openUnicodeString(&bufferText, &buffer, &status);
4930
4931 /*
4932 * getText() and getUText()
4933 */
4934 {
4935 UText text1 = UTEXT_INITIALIZER;
4936 UText text2 = UTEXT_INITIALIZER;
4937 UChar text2Chars[20];
4938 UText *resultText;
4939
4940 status = U_ZERO_ERROR;
4941 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4942 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4943 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4944 utext_openUChars(&text2, text2Chars, -1, &status);
57a6839d 4945
729e4ab9
A
4946 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4947 re = uregex_openUText(&patternText, 0, NULL, &status);
4948
4949 /* First set a UText */
4950 uregex_setUText(re, &text1, &status);
4951 resultText = uregex_getUText(re, &bufferText, &status);
4952 REGEX_CHECK_STATUS;
4953 REGEX_ASSERT(resultText == &bufferText);
4954 utext_setNativeIndex(resultText, 0);
4955 utext_setNativeIndex(&text1, 0);
4388f060 4956 REGEX_ASSERT(testUTextEqual(resultText, &text1));
57a6839d 4957
729e4ab9
A
4958 resultText = uregex_getUText(re, &bufferText, &status);
4959 REGEX_CHECK_STATUS;
4960 REGEX_ASSERT(resultText == &bufferText);
4961 utext_setNativeIndex(resultText, 0);
4962 utext_setNativeIndex(&text1, 0);
4388f060 4963 REGEX_ASSERT(testUTextEqual(resultText, &text1));
729e4ab9
A
4964
4965 /* Then set a UChar * */
4966 uregex_setText(re, text2Chars, 7, &status);
4967 resultText = uregex_getUText(re, &bufferText, &status);
4968 REGEX_CHECK_STATUS;
4969 REGEX_ASSERT(resultText == &bufferText);
4970 utext_setNativeIndex(resultText, 0);
4971 utext_setNativeIndex(&text2, 0);
4388f060 4972 REGEX_ASSERT(testUTextEqual(resultText, &text2));
57a6839d 4973
729e4ab9
A
4974 uregex_close(re);
4975 utext_close(&text1);
4976 utext_close(&text2);
4977 }
4978
4979 /*
4980 * group()
4981 */
4982 {
4983 UChar text1[80];
4984 UText *actual;
4985 UBool result;
4986 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
4987
4988 status = U_ZERO_ERROR;
4989 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4990 REGEX_CHECK_STATUS;
4991
4992 uregex_setText(re, text1, -1, &status);
4993 result = uregex_find(re, 0, &status);
4994 REGEX_ASSERT(result==TRUE);
4995
4996 /* Capture Group 0, the full match. Should succeed. */
4997 status = U_ZERO_ERROR;
4998 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4999 REGEX_CHECK_STATUS;
5000 REGEX_ASSERT(actual == &bufferText);
5001 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
5002
5003 /* Capture group #1. Should succeed. */
5004 status = U_ZERO_ERROR;
5005 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
5006 REGEX_CHECK_STATUS;
5007 REGEX_ASSERT(actual == &bufferText);
5008 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5009
5010 /* Capture group out of range. Error. */
5011 status = U_ZERO_ERROR;
5012 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5013 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5014 REGEX_ASSERT(actual == &bufferText);
5015
5016 uregex_close(re);
5017
5018 }
57a6839d 5019
729e4ab9
A
5020 /*
5021 * replaceFirst()
5022 */
5023 {
5024 UChar text1[80];
5025 UChar text2[80];
5026 UText replText = UTEXT_INITIALIZER;
5027 UText *result;
57a6839d 5028
729e4ab9
A
5029 status = U_ZERO_ERROR;
5030 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5031 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5032 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5033
5034 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5035 REGEX_CHECK_STATUS;
5036
5037 /* Normal case, with match */
5038 uregex_setText(re, text1, -1, &status);
5039 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5040 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5041 REGEX_CHECK_STATUS;
5042 REGEX_ASSERT(result == &bufferText);
5043 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5044
5045 /* No match. Text should copy to output with no changes. */
5046 uregex_setText(re, text2, -1, &status);
5047 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5048 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5049 REGEX_CHECK_STATUS;
5050 REGEX_ASSERT(result == &bufferText);
5051 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
57a6839d 5052
729e4ab9
A
5053 /* Unicode escapes */
5054 uregex_setText(re, text1, -1, &status);
5055 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5056 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5057 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5058 REGEX_CHECK_STATUS;
5059 REGEX_ASSERT(result == &bufferText);
5060 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5061
5062 uregex_close(re);
5063 utext_close(&replText);
5064 }
5065
5066
5067 /*
5068 * replaceAll()
5069 */
5070 {
5071 UChar text1[80];
5072 UChar text2[80];
5073 UText replText = UTEXT_INITIALIZER;
5074 UText *result;
5075
5076 status = U_ZERO_ERROR;
5077 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5078 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5079 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5080
5081 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5082 REGEX_CHECK_STATUS;
5083
5084 /* Normal case, with match */
5085 uregex_setText(re, text1, -1, &status);
5086 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5087 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5088 REGEX_CHECK_STATUS;
5089 REGEX_ASSERT(result == &bufferText);
5090 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5091
5092 /* No match. Text should copy to output with no changes. */
5093 uregex_setText(re, text2, -1, &status);
5094 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5095 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5096 REGEX_CHECK_STATUS;
5097 REGEX_ASSERT(result == &bufferText);
5098 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5099
5100 uregex_close(re);
5101 utext_close(&replText);
5102 }
5103
5104
5105 /*
5106 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5107 * so we don't need to test it here.
5108 */
57a6839d 5109
729e4ab9
A
5110 utext_close(&bufferText);
5111 utext_close(&patternText);
5112}
5113
5114//--------------------------------------------------------------
5115//
5116// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5117//
5118//---------------------------------------------------------------
5119void RegexTest::Bug7651() {
5120 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5121 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5122 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5123 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5124 UnicodeString s("#ff @abcd This is test");
5125 RegexPattern *REPattern = NULL;
5126 RegexMatcher *REMatcher = NULL;
5127 UErrorCode status = U_ZERO_ERROR;
5128 UParseError pe;
5129
5130 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5131 REGEX_CHECK_STATUS;
5132 REMatcher = REPattern->matcher(s, status);
5133 REGEX_CHECK_STATUS;
5134 REGEX_ASSERT(REMatcher->find());
5135 REGEX_ASSERT(REMatcher->start(status) == 0);
5136 delete REPattern;
5137 delete REMatcher;
5138 status = U_ZERO_ERROR;
5139
5140 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5141 REGEX_CHECK_STATUS;
5142 REMatcher = REPattern->matcher(s, status);
5143 REGEX_CHECK_STATUS;
5144 REGEX_ASSERT(REMatcher->find());
5145 REGEX_ASSERT(REMatcher->start(status) == 0);
5146 delete REPattern;
5147 delete REMatcher;
5148 status = U_ZERO_ERROR;
5149 }
5150
5151void RegexTest::Bug7740() {
5152 UErrorCode status = U_ZERO_ERROR;
5153 UnicodeString pattern = "(a)";
5154 UnicodeString text = "abcdef";
5155 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5156 REGEX_CHECK_STATUS;
5157 REGEX_ASSERT(m->lookingAt(status));
5158 REGEX_CHECK_STATUS;
5159 status = U_ILLEGAL_ARGUMENT_ERROR;
5160 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5161 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5162 REGEX_ASSERT(s == "");
5163 delete m;
5164}
5165
4388f060
A
5166// Bug 8479: was crashing with a Bogus UnicodeString as input.
5167
5168void RegexTest::Bug8479() {
5169 UErrorCode status = U_ZERO_ERROR;
729e4ab9 5170
4388f060
A
5171 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5172 REGEX_CHECK_STATUS;
5173 if (U_SUCCESS(status))
5174 {
5175 UnicodeString str;
5176 str.setToBogus();
5177 pMatcher->reset(str);
5178 status = U_ZERO_ERROR;
5179 pMatcher->matches(status);
5180 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5181 delete pMatcher;
5182 }
5183}
57a6839d 5184
729e4ab9 5185
4388f060
A
5186// Bug 7029
5187void RegexTest::Bug7029() {
5188 UErrorCode status = U_ZERO_ERROR;
5189
5190 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5191 UnicodeString text = "abc.def";
5192 UnicodeString splits[10];
5193 REGEX_CHECK_STATUS;
5194 int32_t numFields = pMatcher->split(text, splits, 10, status);
5195 REGEX_CHECK_STATUS;
5196 REGEX_ASSERT(numFields == 8);
5197 delete pMatcher;
5198}
5199
5200// Bug 9283
5201// This test is checking for the existence of any supplemental characters that case-fold
57a6839d 5202// to a bmp character.
4388f060 5203//
57a6839d
A
5204// At the time of this writing there are none. If any should appear in a subsequent release
5205// of Unicode, the code in regular expressions compilation that determines the longest
5206// possible match for a literal string will need to be enhanced.
4388f060
A
5207//
5208// See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5209// for details on what to do in case of a failure of this test.
5210//
5211void RegexTest::Bug9283() {
57a6839d 5212#if !UCONFIG_NO_NORMALIZATION
4388f060
A
5213 UErrorCode status = U_ZERO_ERROR;
5214 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5215 REGEX_CHECK_STATUS;
5216 int32_t index;
5217 UChar32 c;
5218 for (index=0; ; index++) {
5219 c = supplementalsWithCaseFolding.charAt(index);
5220 if (c == -1) {
5221 break;
5222 }
5223 UnicodeString cf = UnicodeString(c).foldCase();
5224 REGEX_ASSERT(cf.length() >= 2);
5225 }
57a6839d 5226#endif /* #if !UCONFIG_NO_NORMALIZATION */
4388f060
A
5227}
5228
5229
5230void RegexTest::CheckInvBufSize() {
5231 if(inv_next>=INV_BUFSIZ) {
5232 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5233 __FILE__, INV_BUFSIZ, inv_next);
5234 } else {
5235 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5236 }
5237}
5238
57a6839d
A
5239
5240void RegexTest::Bug10459() {
5241 UErrorCode status = U_ZERO_ERROR;
5242 UnicodeString patternString("(txt)");
5243 UnicodeString txtString("txt");
5244
5245 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5246 REGEX_CHECK_STATUS;
5247 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5248 REGEX_CHECK_STATUS;
5249
5250 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5251 REGEX_CHECK_STATUS;
5252
5253 uregex_setUText(icu_re, utext_txt, &status);
5254 REGEX_CHECK_STATUS;
5255
5256 // The bug was that calling uregex_group() before doing a matching operation
5257 // was causing a segfault. Only for Regular Expressions created from UText.
5258 // It should set an U_REGEX_INVALID_STATE.
5259
5260 UChar buf[100];
5261 int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status);
5262 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5263 REGEX_ASSERT(len == 0);
5264
5265 uregex_close(icu_re);
5266 utext_close(utext_pat);
5267 utext_close(utext_txt);
5268}
5269
b75a7d8f
A
5270#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5271