]> git.saurik.com Git - apple/icu.git/blame - icuSources/test/intltest/regextst.cpp
ICU-461.18.tar.gz
[apple/icu.git] / icuSources / test / intltest / regextst.cpp
CommitLineData
b75a7d8f
A
1/********************************************************************
2 * COPYRIGHT:
729e4ab9 3 * Copyright (c) 2002-2010, International Business Machines Corporation and
b75a7d8f
A
4 * others. All Rights Reserved.
5 ********************************************************************/
6
7//
8// regextst.cpp
9//
10// ICU Regular Expressions test, part of intltest.
11//
12
374ca955 13#include "intltest.h"
b75a7d8f
A
14#if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
374ca955 16#include "unicode/regex.h"
b75a7d8f
A
17#include "unicode/uchar.h"
18#include "unicode/ucnv.h"
729e4ab9 19#include "unicode/ustring.h"
b75a7d8f
A
20#include "regextst.h"
21#include "uvector.h"
b75a7d8f 22#include "util.h"
374ca955 23#include <stdlib.h>
73c04bcf 24#include <string.h>
374ca955 25#include <stdio.h>
729e4ab9
A
26#include "cstring.h"
27#include "uinvchar.h"
b75a7d8f 28
729e4ab9 29#define SUPPORT_MUTATING_INPUT_STRING 0
b75a7d8f
A
30
31//---------------------------------------------------------------------------
32//
33// Test class boilerplate
34//
35//---------------------------------------------------------------------------
374ca955 36RegexTest::RegexTest()
b75a7d8f 37{
73c04bcf 38}
b75a7d8f
A
39
40
41RegexTest::~RegexTest()
42{
73c04bcf 43}
b75a7d8f
A
44
45
46
47void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
48{
49 if (exec) logln("TestSuite RegexTest: ");
50 switch (index) {
51
52 case 0: name = "Basic";
374ca955 53 if (exec) Basic();
b75a7d8f
A
54 break;
55 case 1: name = "API_Match";
374ca955 56 if (exec) API_Match();
b75a7d8f
A
57 break;
58 case 2: name = "API_Replace";
374ca955 59 if (exec) API_Replace();
b75a7d8f
A
60 break;
61 case 3: name = "API_Pattern";
374ca955 62 if (exec) API_Pattern();
b75a7d8f 63 break;
729e4ab9
A
64 case 4:
65#if !UCONFIG_NO_FILE_IO
66 name = "Extended";
374ca955 67 if (exec) Extended();
729e4ab9
A
68#else
69 name = "skip";
70#endif
b75a7d8f
A
71 break;
72 case 5: name = "Errors";
374ca955 73 if (exec) Errors();
b75a7d8f
A
74 break;
75 case 6: name = "PerlTests";
76 if (exec) PerlTests();
77 break;
46f4442e 78 case 7: name = "Callbacks";
729e4ab9
A
79 if (exec) Callbacks();
80 break;
81 case 8: name = "FindProgressCallbacks";
82 if (exec) FindProgressCallbacks();
83 break;
84 case 9: name = "Bug 6149";
85 if (exec) Bug6149();
86 break;
87 case 10: name = "UTextBasic";
88 if (exec) UTextBasic();
89 break;
90 case 11: name = "API_Match_UTF8";
91 if (exec) API_Match_UTF8();
92 break;
93 case 12: name = "API_Replace_UTF8";
94 if (exec) API_Replace_UTF8();
95 break;
96 case 13: name = "API_Pattern_UTF8";
97 if (exec) API_Pattern_UTF8();
98 break;
99 case 14: name = "PerlTestsUTF8";
100 if (exec) PerlTestsUTF8();
101 break;
102 case 15: name = "PreAllocatedUTextCAPI";
103 if (exec) PreAllocatedUTextCAPI();
46f4442e 104 break;
729e4ab9
A
105 case 16: name = "Bug 7651";
106 if (exec) Bug7651();
107 break;
108 case 17: name = "Bug 7740";
109 if (exec) Bug7740();
110 break;
b75a7d8f 111
374ca955 112 default: name = "";
b75a7d8f
A
113 break; //needed to end loop
114 }
115}
116
117
729e4ab9
A
118/**
119 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
120 * into ASCII.
121 * @see utext_openUTF8
122 */
123static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
124
125static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
126#if U_CHARSET_FAMILY==U_ASCII_FAMILY
127 return utext_openUTF8(ut, inv, length, status);
128#else
129 char buf[1024];
130
131 uprv_aestrncpy((uint8_t*)buf, (const uint8_t*)inv, length);
132
133 return utext_openUTF8(ut, buf, length, status);
134#endif
135}
136
b75a7d8f
A
137//---------------------------------------------------------------------------
138//
139// Error Checking / Reporting macros used in all of the tests.
140//
141//---------------------------------------------------------------------------
b75a7d8f 142
729e4ab9
A
143static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
144 int64_t oldIndex = utext_getNativeIndex(text);
145 utext_setNativeIndex(text, 0);
146 char *bufPtr = buf;
147 UChar32 c = utext_next32From(text, 0);
148 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
149 if (0x000020<=c && c<0x00007e) {
150 *bufPtr = c;
151 } else {
152#if 0
153 sprintf(bufPtr,"U+%04X", c);
154 bufPtr+= strlen(bufPtr)-1;
155#else
156 *bufPtr = '%';
157#endif
158 }
159 bufPtr++;
160 c = UTEXT_NEXT32(text);
161 }
162 *bufPtr = 0;
163#if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
164 char *ebuf = (char*)malloc(bufLen);
165 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
166 uprv_strncpy(buf, ebuf, bufLen);
167 free((void*)ebuf);
168#endif
169 utext_setNativeIndex(text, oldIndex);
170}
171
// Log (verbose only) a printable rendering of a UText, tagged with the call site.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// If the local `status` holds a failure, report it and return from the enclosing (void) function.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Report an error when expr evaluates to FALSE; execution continues afterwards.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluate expr against a fresh, macro-local `status` (it shadows any outer
// `status`) and report if the resulting status is not errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line and does not return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// Like REGEX_ASSERT, but reports a caller-supplied line and returns from the
// enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}
188
729e4ab9
A
189/**
190 * @param expected expected text in UTF-8 (not platform) codepage
191 */
192void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
193 UErrorCode status = U_ZERO_ERROR;
194 UText expectedText = UTEXT_INITIALIZER;
195 utext_openUTF8(&expectedText, expected, -1, &status);
196 if(U_FAILURE(status)) {
197 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
198 return;
199 }
200 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
201 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
202 return;
203 }
204 utext_setNativeIndex(actual, 0);
205 if (utext_compare(&expectedText, -1, actual, -1) != 0) {
206 char buf[201 /*21*/];
207 char expectedBuf[201];
208 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
209 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
210 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
211 }
212 utext_close(&expectedText);
213}
214/**
215 * @param expected invariant (platform local text) input
216 */
217
218void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
219 UErrorCode status = U_ZERO_ERROR;
220 UText expectedText = UTEXT_INITIALIZER;
221 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
222 if(U_FAILURE(status)) {
223 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
224 return;
225 }
226 utext_setNativeIndex(actual, 0);
227 if (utext_compare(&expectedText, -1, actual, -1) != 0) {
228 char buf[201 /*21*/];
229 char expectedBuf[201];
230 utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
231 utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
232 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
233 }
234 utext_close(&expectedText);
235}
236
/**
 * Check a UText against expected text given as UTF-8 bytes.
 * Forwards to assertUText() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Check a UText against expected text given as invariant characters.
 * Forwards to assertUTextInvariant() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
b75a7d8f
A
245
246
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick
//                       tests for the lookingAt() and matches() functions.
//                       Each use runs the check twice: once through
//                       doRegexLMTest() and once through doRegexLMTestUTF8().
//
//       usage:
//          REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
}
b75a7d8f 262
46f4442e
A
263UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
264 const UnicodeString pattern(pat, -1, US_INV);
265 const UnicodeString inputText(text, -1, US_INV);
b75a7d8f
A
266 UErrorCode status = U_ZERO_ERROR;
267 UParseError pe;
268 RegexPattern *REPattern = NULL;
269 RegexMatcher *REMatcher = NULL;
270 UBool retVal = TRUE;
271
46f4442e 272 UnicodeString patString(pat, -1, US_INV);
b75a7d8f
A
273 REPattern = RegexPattern::compile(patString, 0, pe, status);
274 if (U_FAILURE(status)) {
729e4ab9 275 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
b75a7d8f
A
276 line, u_errorName(status));
277 return FALSE;
278 }
374ca955 279 if (line==376) { RegexPatternDump(REPattern);}
b75a7d8f
A
280
281 UnicodeString inputString(inputText);
282 UnicodeString unEscapedInput = inputString.unescape();
283 REMatcher = REPattern->matcher(unEscapedInput, status);
284 if (U_FAILURE(status)) {
285 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
286 line, u_errorName(status));
287 return FALSE;
288 }
374ca955 289
b75a7d8f
A
290 UBool actualmatch;
291 actualmatch = REMatcher->lookingAt(status);
292 if (U_FAILURE(status)) {
293 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
294 line, u_errorName(status));
295 retVal = FALSE;
296 }
297 if (actualmatch != looking) {
298 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
299 retVal = FALSE;
300 }
301
302 status = U_ZERO_ERROR;
303 actualmatch = REMatcher->matches(status);
304 if (U_FAILURE(status)) {
305 errln("RegexTest failure in matches() at line %d. Status = %s\n",
306 line, u_errorName(status));
307 retVal = FALSE;
308 }
309 if (actualmatch != match) {
310 errln("RegexTest: wrong return from matches() at line %d.\n", line);
311 retVal = FALSE;
312 }
313
314 if (retVal == FALSE) {
374ca955 315 RegexPatternDump(REPattern);
b75a7d8f
A
316 }
317
318 delete REPattern;
319 delete REMatcher;
320 return retVal;
321}
374ca955 322
b75a7d8f 323
729e4ab9
A
324UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
325 UText pattern = UTEXT_INITIALIZER;
326 int32_t inputUTF8Length;
327 char *textChars = NULL;
328 UText inputText = UTEXT_INITIALIZER;
329 UErrorCode status = U_ZERO_ERROR;
330 UParseError pe;
331 RegexPattern *REPattern = NULL;
332 RegexMatcher *REMatcher = NULL;
333 UBool retVal = TRUE;
334
335 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
336 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
337 if (U_FAILURE(status)) {
338 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
339 line, u_errorName(status));
340 return FALSE;
341 }
342
343 UnicodeString inputString(text, -1, US_INV);
344 UnicodeString unEscapedInput = inputString.unescape();
345 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
346 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
347
348 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
349 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
350 // UTF-8 does not allow unpaired surrogates, so this could actually happen
351 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
352 return TRUE; // not a failure of the Regex engine
353 }
354 status = U_ZERO_ERROR; // buffer overflow
355 textChars = new char[inputUTF8Length+1];
356 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
357 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
358
359 REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
360 if (U_FAILURE(status)) {
361 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
362 line, u_errorName(status));
363 return FALSE;
364 }
365
366 UBool actualmatch;
367 actualmatch = REMatcher->lookingAt(status);
368 if (U_FAILURE(status)) {
369 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
370 line, u_errorName(status));
371 retVal = FALSE;
372 }
373 if (actualmatch != looking) {
374 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
375 retVal = FALSE;
376 }
377
378 status = U_ZERO_ERROR;
379 actualmatch = REMatcher->matches(status);
380 if (U_FAILURE(status)) {
381 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
382 line, u_errorName(status));
383 retVal = FALSE;
384 }
385 if (actualmatch != match) {
386 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
387 retVal = FALSE;
388 }
389
390 if (retVal == FALSE) {
391 RegexPatternDump(REPattern);
392 }
393
394 delete REPattern;
395 delete REMatcher;
396 utext_close(&inputText);
397 utext_close(&pattern);
398 delete[] textChars;
399 return retVal;
400}
b75a7d8f
A
401
402
b75a7d8f
A
403
404//---------------------------------------------------------------------------
405//
406// REGEX_ERR Macro + invocation function to simplify writing tests
407// regex tests for incorrect patterns
408//
409// usage:
410// REGEX_ERR("pattern", expected error line, column, expected status);
411//
412//---------------------------------------------------------------------------
413#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
414
415void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
46f4442e 416 UErrorCode expectedStatus, int32_t line) {
b75a7d8f
A
417 UnicodeString pattern(pat);
418
419 UErrorCode status = U_ZERO_ERROR;
420 UParseError pe;
421 RegexPattern *callerPattern = NULL;
422
423 //
424 // Compile the caller's pattern
425 //
426 UnicodeString patString(pat);
427 callerPattern = RegexPattern::compile(patString, 0, pe, status);
428 if (status != expectedStatus) {
729e4ab9 429 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
b75a7d8f
A
430 } else {
431 if (status != U_ZERO_ERROR) {
432 if (pe.line != errLine || pe.offset != errCol) {
433 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
434 line, errLine, errCol, pe.line, pe.offset);
435 }
436 }
437 }
438
439 delete callerPattern;
729e4ab9
A
440
441 //
442 // Compile again, using a UTF-8-based UText
443 //
444 UText patternText = UTEXT_INITIALIZER;
445 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
446 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
447 if (status != expectedStatus) {
448 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
449 } else {
450 if (status != U_ZERO_ERROR) {
451 if (pe.line != errLine || pe.offset != errCol) {
452 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
453 line, errLine, errCol, pe.line, pe.offset);
454 }
455 }
456 }
457
458 delete callerPattern;
459 utext_close(&patternText);
b75a7d8f
A
460}
461
462
463
464//---------------------------------------------------------------------------
465//
466// Basic Check for basic functionality of regex pattern matching.
467// Avoid the use of REGEX_FIND test macro, which has
468// substantial dependencies on basic Regex functionality.
469//
470//---------------------------------------------------------------------------
471void RegexTest::Basic() {
472
473
474//
475// Debug - slide failing test cases early
476//
477#if 0
478 {
479 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
480 UParseError pe;
481 UErrorCode status = U_ZERO_ERROR;
482 RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
483 // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
484 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
485 }
486 exit(1);
487#endif
488
489
490 //
491 // Pattern with parentheses
492 //
493 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
494 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
495 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
496
497 //
498 // Patterns with *
499 //
500 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
501 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
502 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
503 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
504 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
505
506 REGEX_TESTLM("a*", "", TRUE, TRUE);
507 REGEX_TESTLM("a*", "b", TRUE, FALSE);
508
509
510 //
511 // Patterns with "."
512 //
513 REGEX_TESTLM(".", "abc", TRUE, FALSE);
514 REGEX_TESTLM("...", "abc", TRUE, TRUE);
515 REGEX_TESTLM("....", "abc", FALSE, FALSE);
516 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
517 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
518 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
519 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
520 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
521
522 //
523 // Patterns with * applied to chars at end of literal string
524 //
525 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
526 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
527
528 //
529 // Supplemental chars match as single chars, not a pair of surrogates.
530 //
531 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
532 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
533 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
534
535
536 //
537 // UnicodeSets in the pattern
538 //
539 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
540 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
541 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
542 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
543 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
544 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
545
546 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
547 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
548 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
549 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
550 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
551
552 //
553 // OR operator in patterns
554 //
555 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
556 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
557 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
558 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
559
560 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
561 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
562 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
563 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
564 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
565 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
566
567 //
568 // +
569 //
570 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
571 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
572 REGEX_TESTLM("b+", "", FALSE, FALSE);
573 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
574 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
575 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
576
577 //
578 // ?
579 //
580 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
581 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
582 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
583 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
584 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
585 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
586 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
587 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
588 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
589
590 //
591 // Escape sequences that become single literal chars, handled internally
592 // by ICU's Unescape.
593 //
374ca955 594
b75a7d8f
A
595 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
596 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
374ca955
A
597 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
598 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
b75a7d8f
A
599 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
600 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
601 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
602 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
374ca955
A
603 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
604 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
b75a7d8f
A
605
606 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
607 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
608
609 // Escape of special chars in patterns
374ca955 610 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
729e4ab9 611}
b75a7d8f
A
612
613
729e4ab9
A
614//---------------------------------------------------------------------------
615//
616// UTextBasic Check for quirks that are specific to the UText
617// implementation.
618//
619//---------------------------------------------------------------------------
620void RegexTest::UTextBasic() {
621 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
622 UErrorCode status = U_ZERO_ERROR;
623 UText pattern = UTEXT_INITIALIZER;
624 utext_openUTF8(&pattern, str_abc, -1, &status);
625 RegexMatcher matcher(&pattern, 0, status);
626 REGEX_CHECK_STATUS;
627
628 UText input = UTEXT_INITIALIZER;
629 utext_openUTF8(&input, str_abc, -1, &status);
630 REGEX_CHECK_STATUS;
631 matcher.reset(&input);
632 REGEX_CHECK_STATUS;
633 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
634
635 matcher.reset(matcher.inputText());
636 REGEX_CHECK_STATUS;
637 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
638
639 utext_close(&pattern);
640 utext_close(&input);
73c04bcf 641}
b75a7d8f
A
642
643
644//---------------------------------------------------------------------------
645//
374ca955 646// API_Match Test that the API for class RegexMatcher
b75a7d8f
A
647// is present and nominally working, but excluding functions
648// implementing replace operations.
649//
650//---------------------------------------------------------------------------
651void RegexTest::API_Match() {
652 UParseError pe;
653 UErrorCode status=U_ZERO_ERROR;
654 int32_t flags = 0;
655
656 //
657 // Debug - slide failing test cases early
658 //
659#if 0
660 {
661 }
662 return;
663#endif
664
665 //
666 // Simple pattern compilation
667 //
668 {
669 UnicodeString re("abc");
670 RegexPattern *pat2;
671 pat2 = RegexPattern::compile(re, flags, pe, status);
672 REGEX_CHECK_STATUS;
374ca955 673
b75a7d8f
A
674 UnicodeString inStr1 = "abcdef this is a test";
675 UnicodeString instr2 = "not abc";
676 UnicodeString empty = "";
374ca955
A
677
678
b75a7d8f
A
679 //
680 // Matcher creation and reset.
681 //
682 RegexMatcher *m1 = pat2->matcher(inStr1, status);
683 REGEX_CHECK_STATUS;
374ca955 684 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f
A
685 REGEX_ASSERT(m1->input() == inStr1);
686 m1->reset(instr2);
687 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
688 REGEX_ASSERT(m1->input() == instr2);
689 m1->reset(inStr1);
690 REGEX_ASSERT(m1->input() == inStr1);
691 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
692 m1->reset(empty);
693 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
694 REGEX_ASSERT(m1->input() == empty);
695 REGEX_ASSERT(&m1->pattern() == pat2);
374ca955
A
696
697 //
698 // reset(pos, status)
699 //
700 m1->reset(inStr1);
701 m1->reset(4, status);
702 REGEX_CHECK_STATUS;
703 REGEX_ASSERT(m1->input() == inStr1);
704 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
705
706 m1->reset(-1, status);
707 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
708 status = U_ZERO_ERROR;
709
710 m1->reset(0, status);
711 REGEX_CHECK_STATUS;
712 status = U_ZERO_ERROR;
713
714 int32_t len = m1->input().length();
715 m1->reset(len-1, status);
716 REGEX_CHECK_STATUS;
717 status = U_ZERO_ERROR;
718
719 m1->reset(len, status);
729e4ab9
A
720 REGEX_CHECK_STATUS;
721 status = U_ZERO_ERROR;
722
723 m1->reset(len+1, status);
374ca955
A
724 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
725 status = U_ZERO_ERROR;
726
727 //
728 // match(pos, status)
729 //
730 m1->reset(instr2);
731 REGEX_ASSERT(m1->matches(4, status) == TRUE);
732 m1->reset();
733 REGEX_ASSERT(m1->matches(3, status) == FALSE);
734 m1->reset();
735 REGEX_ASSERT(m1->matches(5, status) == FALSE);
736 REGEX_ASSERT(m1->matches(4, status) == TRUE);
737 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
738 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
739
740 // Match() at end of string should fail, but should not
741 // be an error.
742 status = U_ZERO_ERROR;
743 len = m1->input().length();
744 REGEX_ASSERT(m1->matches(len, status) == FALSE);
745 REGEX_CHECK_STATUS;
746
747 // Match beyond end of string should fail with an error.
748 status = U_ZERO_ERROR;
749 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
750 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
751
752 // Successful match at end of string.
753 {
754 status = U_ZERO_ERROR;
755 RegexMatcher m("A?", 0, status); // will match zero length string.
756 REGEX_CHECK_STATUS;
757 m.reset(inStr1);
758 len = inStr1.length();
759 REGEX_ASSERT(m.matches(len, status) == TRUE);
760 REGEX_CHECK_STATUS;
761 m.reset(empty);
762 REGEX_ASSERT(m.matches(0, status) == TRUE);
763 REGEX_CHECK_STATUS;
764 }
765
766
767 //
768 // lookingAt(pos, status)
769 //
770 status = U_ZERO_ERROR;
771 m1->reset(instr2); // "not abc"
772 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
773 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
774 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
775 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
776 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
777 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
778 status = U_ZERO_ERROR;
779 len = m1->input().length();
780 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
781 REGEX_CHECK_STATUS;
782 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
783 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
784
b75a7d8f
A
785 delete m1;
786 delete pat2;
787 }
788
789
790 //
374ca955 791 // Capture Group.
b75a7d8f
A
792 // RegexMatcher::start();
793 // RegexMatcher::end();
794 // RegexMatcher::groupCount();
795 //
796 {
797 int32_t flags=0;
798 UParseError pe;
799 UErrorCode status=U_ZERO_ERROR;
800
801 UnicodeString re("01(23(45)67)(.*)");
802 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
803 REGEX_CHECK_STATUS;
804 UnicodeString data = "0123456789";
374ca955 805
b75a7d8f
A
806 RegexMatcher *matcher = pat->matcher(data, status);
807 REGEX_CHECK_STATUS;
374ca955 808 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
46f4442e
A
809 static const int32_t matchStarts[] = {0, 2, 4, 8};
810 static const int32_t matchEnds[] = {10, 8, 6, 10};
811 int32_t i;
b75a7d8f
A
812 for (i=0; i<4; i++) {
813 int32_t actualStart = matcher->start(i, status);
814 REGEX_CHECK_STATUS;
815 if (actualStart != matchStarts[i]) {
816 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
817 __LINE__, i, matchStarts[i], actualStart);
818 }
819 int32_t actualEnd = matcher->end(i, status);
820 REGEX_CHECK_STATUS;
821 if (actualEnd != matchEnds[i]) {
822 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
823 __LINE__, i, matchEnds[i], actualEnd);
824 }
825 }
826
827 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
828 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
829
830 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
831 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
832 matcher->reset();
833 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
834
835 matcher->lookingAt(status);
836 REGEX_ASSERT(matcher->group(status) == "0123456789");
837 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
838 REGEX_ASSERT(matcher->group(1, status) == "234567" );
839 REGEX_ASSERT(matcher->group(2, status) == "45" );
840 REGEX_ASSERT(matcher->group(3, status) == "89" );
841 REGEX_CHECK_STATUS;
842 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
843 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
844 matcher->reset();
845 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
846
847 delete matcher;
848 delete pat;
849
850 }
851
852 //
853 // find
854 //
855 {
856 int32_t flags=0;
857 UParseError pe;
858 UErrorCode status=U_ZERO_ERROR;
859
860 UnicodeString re("abc");
861 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
862 REGEX_CHECK_STATUS;
863 UnicodeString data = ".abc..abc...abc..";
864 // 012345678901234567
374ca955 865
b75a7d8f
A
866 RegexMatcher *matcher = pat->matcher(data, status);
867 REGEX_CHECK_STATUS;
868 REGEX_ASSERT(matcher->find());
869 REGEX_ASSERT(matcher->start(status) == 1);
870 REGEX_ASSERT(matcher->find());
871 REGEX_ASSERT(matcher->start(status) == 6);
872 REGEX_ASSERT(matcher->find());
873 REGEX_ASSERT(matcher->start(status) == 12);
874 REGEX_ASSERT(matcher->find() == FALSE);
875 REGEX_ASSERT(matcher->find() == FALSE);
876
877 matcher->reset();
878 REGEX_ASSERT(matcher->find());
879 REGEX_ASSERT(matcher->start(status) == 1);
880
881 REGEX_ASSERT(matcher->find(0, status));
882 REGEX_ASSERT(matcher->start(status) == 1);
883 REGEX_ASSERT(matcher->find(1, status));
884 REGEX_ASSERT(matcher->start(status) == 1);
885 REGEX_ASSERT(matcher->find(2, status));
886 REGEX_ASSERT(matcher->start(status) == 6);
887 REGEX_ASSERT(matcher->find(12, status));
888 REGEX_ASSERT(matcher->start(status) == 12);
889 REGEX_ASSERT(matcher->find(13, status) == FALSE);
890 REGEX_ASSERT(matcher->find(16, status) == FALSE);
374ca955 891 REGEX_ASSERT(matcher->find(17, status) == FALSE);
b75a7d8f 892 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
b75a7d8f 893
374ca955 894 status = U_ZERO_ERROR;
b75a7d8f 895 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
896 status = U_ZERO_ERROR;
897 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f
A
898
899 REGEX_ASSERT(matcher->groupCount() == 0);
900
901 delete matcher;
902 delete pat;
903 }
904
905
906 //
907 // find, with \G in pattern (true if at the end of a previous match).
908 //
909 {
910 int32_t flags=0;
911 UParseError pe;
912 UErrorCode status=U_ZERO_ERROR;
913
46f4442e 914 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
b75a7d8f
A
915 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
916 REGEX_CHECK_STATUS;
917 UnicodeString data = ".abcabc.abc..";
918 // 012345678901234567
374ca955 919
b75a7d8f
A
920 RegexMatcher *matcher = pat->matcher(data, status);
921 REGEX_CHECK_STATUS;
922 REGEX_ASSERT(matcher->find());
923 REGEX_ASSERT(matcher->start(status) == 0);
374ca955 924 REGEX_ASSERT(matcher->start(1, status) == -1);
b75a7d8f
A
925 REGEX_ASSERT(matcher->start(2, status) == 1);
926
927 REGEX_ASSERT(matcher->find());
928 REGEX_ASSERT(matcher->start(status) == 4);
374ca955 929 REGEX_ASSERT(matcher->start(1, status) == 4);
b75a7d8f
A
930 REGEX_ASSERT(matcher->start(2, status) == -1);
931 REGEX_CHECK_STATUS;
932
933 delete matcher;
934 delete pat;
935 }
936
374ca955
A
937 //
938 // find with zero length matches, match position should bump ahead
939 // to prevent loops.
940 //
941 {
46f4442e 942 int32_t i;
374ca955
A
943 UErrorCode status=U_ZERO_ERROR;
944 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
945 // using an always-true look-ahead.
946 REGEX_CHECK_STATUS;
947 UnicodeString s(" ");
948 m.reset(s);
949 for (i=0; ; i++) {
950 if (m.find() == FALSE) {
951 break;
952 }
953 REGEX_ASSERT(m.start(status) == i);
954 REGEX_ASSERT(m.end(status) == i);
955 }
956 REGEX_ASSERT(i==5);
957
958 // Check that the bump goes over surrogate pairs OK
46f4442e 959 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
374ca955
A
960 s = s.unescape();
961 m.reset(s);
962 for (i=0; ; i+=2) {
963 if (m.find() == FALSE) {
964 break;
965 }
966 REGEX_ASSERT(m.start(status) == i);
967 REGEX_ASSERT(m.end(status) == i);
968 }
969 REGEX_ASSERT(i==10);
970 }
971 {
972 // find() loop breaking test.
973 // with pattern of /.?/, should see a series of one char matches, then a single
974 // match of zero length at the end of the input string.
46f4442e 975 int32_t i;
374ca955
A
976 UErrorCode status=U_ZERO_ERROR;
977 RegexMatcher m(".?", 0, status);
978 REGEX_CHECK_STATUS;
979 UnicodeString s(" ");
980 m.reset(s);
981 for (i=0; ; i++) {
982 if (m.find() == FALSE) {
983 break;
984 }
985 REGEX_ASSERT(m.start(status) == i);
986 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
987 }
988 REGEX_ASSERT(i==5);
989 }
990
991
b75a7d8f
A
992 //
993 // Matchers with no input string behave as if they had an empty input string.
994 //
995
996 {
997 UErrorCode status = U_ZERO_ERROR;
998 RegexMatcher m(".?", 0, status);
999 REGEX_CHECK_STATUS;
1000 REGEX_ASSERT(m.find());
1001 REGEX_ASSERT(m.start(status) == 0);
1002 REGEX_ASSERT(m.input() == "");
1003 }
1004 {
1005 UErrorCode status = U_ZERO_ERROR;
1006 RegexPattern *p = RegexPattern::compile(".", 0, status);
1007 RegexMatcher *m = p->matcher(status);
1008 REGEX_CHECK_STATUS;
374ca955 1009
b75a7d8f
A
1010 REGEX_ASSERT(m->find() == FALSE);
1011 REGEX_ASSERT(m->input() == "");
1012 delete m;
1013 delete p;
1014 }
46f4442e
A
1015
1016 //
1017 // Regions
1018 //
1019 {
1020 UErrorCode status = U_ZERO_ERROR;
1021 UnicodeString testString("This is test data");
1022 RegexMatcher m(".*", testString, 0, status);
1023 REGEX_CHECK_STATUS;
1024 REGEX_ASSERT(m.regionStart() == 0);
1025 REGEX_ASSERT(m.regionEnd() == testString.length());
1026 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1027 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1028
1029 m.region(2,4, status);
1030 REGEX_CHECK_STATUS;
1031 REGEX_ASSERT(m.matches(status));
1032 REGEX_ASSERT(m.start(status)==2);
1033 REGEX_ASSERT(m.end(status)==4);
1034 REGEX_CHECK_STATUS;
1035
1036 m.reset();
1037 REGEX_ASSERT(m.regionStart() == 0);
1038 REGEX_ASSERT(m.regionEnd() == testString.length());
1039
1040 UnicodeString shorterString("short");
1041 m.reset(shorterString);
1042 REGEX_ASSERT(m.regionStart() == 0);
1043 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1044
1045 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1046 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1047 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1048 REGEX_ASSERT(&m == &m.reset());
1049 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1050
1051 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1052 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1053 REGEX_ASSERT(&m == &m.reset());
1054 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1055
1056 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1057 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1058 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1059 REGEX_ASSERT(&m == &m.reset());
1060 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1061
1062 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1063 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1064 REGEX_ASSERT(&m == &m.reset());
1065 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1066
1067 }
1068
1069 //
1070 // hitEnd() and requireEnd()
1071 //
1072 {
1073 UErrorCode status = U_ZERO_ERROR;
1074 UnicodeString testString("aabb");
1075 RegexMatcher m1(".*", testString, 0, status);
1076 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1077 REGEX_ASSERT(m1.hitEnd() == TRUE);
1078 REGEX_ASSERT(m1.requireEnd() == FALSE);
1079 REGEX_CHECK_STATUS;
1080
1081 status = U_ZERO_ERROR;
1082 RegexMatcher m2("a*", testString, 0, status);
1083 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1084 REGEX_ASSERT(m2.hitEnd() == FALSE);
1085 REGEX_ASSERT(m2.requireEnd() == FALSE);
1086 REGEX_CHECK_STATUS;
1087
1088 status = U_ZERO_ERROR;
1089 RegexMatcher m3(".*$", testString, 0, status);
1090 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1091 REGEX_ASSERT(m3.hitEnd() == TRUE);
1092 REGEX_ASSERT(m3.requireEnd() == TRUE);
1093 REGEX_CHECK_STATUS;
1094 }
1095
b75a7d8f 1096
374ca955
A
1097 //
1098 // Compilation error on reset with UChar *
1099 // These were a hazard that people were stumbling over with runtime errors.
1100 // Changed them to compiler errors by adding private methods that more closely
1101 // matched the incorrect use of the functions.
1102 //
1103#if 0
1104 {
1105 UErrorCode status = U_ZERO_ERROR;
1106 UChar ucharString[20];
1107 RegexMatcher m(".", 0, status);
1108 m.reset(ucharString); // should not compile.
1109
1110 RegexPattern *p = RegexPattern::compile(".", 0, status);
1111 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1112
1113 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1114 }
1115#endif
1116
46f4442e
A
1117 //
1118 // Time Outs.
1119 // Note: These tests will need to be changed when the regexp engine is
1120 // able to detect and cut short the exponential time behavior on
1121 // this type of match.
1122 //
1123 {
1124 UErrorCode status = U_ZERO_ERROR;
1125 // Enough 'a's in the string to cause the match to time out.
1126 // (Each additional 'a' doubles the time)
1127 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1128 RegexMatcher matcher("(a+)+b", testString, 0, status);
1129 REGEX_CHECK_STATUS;
1130 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1131 matcher.setTimeLimit(100, status);
1132 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1133 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1134 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1135 }
1136 {
1137 UErrorCode status = U_ZERO_ERROR;
1138 // Few enough 'a's to slip in under the time limit.
1139 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1140 RegexMatcher matcher("(a+)+b", testString, 0, status);
1141 REGEX_CHECK_STATUS;
1142 matcher.setTimeLimit(100, status);
1143 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1144 REGEX_CHECK_STATUS;
1145 }
1146
1147 //
1148 // Stack Limits
1149 //
1150 {
1151 UErrorCode status = U_ZERO_ERROR;
729e4ab9 1152 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
46f4442e
A
1153
1154 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1155 // of the '+', and makes the stack frames larger.
1156 RegexMatcher matcher("(A)+A$", testString, 0, status);
1157
1158 // With the default stack, this match should fail to run
1159 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1160 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1161
1162 // With unlimited stack, it should run
1163 status = U_ZERO_ERROR;
1164 matcher.setStackLimit(0, status);
1165 REGEX_CHECK_STATUS;
1166 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1167 REGEX_CHECK_STATUS;
1168 REGEX_ASSERT(matcher.getStackLimit() == 0);
1169
1170 // With a limited stack, the match should fail
1171 status = U_ZERO_ERROR;
1172 matcher.setStackLimit(10000, status);
1173 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1174 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1175 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1176 }
1177
1178 // A pattern that doesn't save state should work with
1179 // a minimal sized stack
1180 {
1181 UErrorCode status = U_ZERO_ERROR;
1182 UnicodeString testString = "abc";
1183 RegexMatcher matcher("abc", testString, 0, status);
1184 REGEX_CHECK_STATUS;
1185 matcher.setStackLimit(30, status);
1186 REGEX_CHECK_STATUS;
1187 REGEX_ASSERT(matcher.matches(status) == TRUE);
1188 REGEX_CHECK_STATUS;
1189 REGEX_ASSERT(matcher.getStackLimit() == 30);
1190
1191 // Negative stack sizes should fail
1192 status = U_ZERO_ERROR;
1193 matcher.setStackLimit(1000, status);
1194 REGEX_CHECK_STATUS;
1195 matcher.setStackLimit(-1, status);
1196 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1197 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1198 }
1199
1200
b75a7d8f
A
1201}
1202
1203
1204
1205
1206
1207
1208//---------------------------------------------------------------------------
1209//
374ca955 1210// API_Replace API test for class RegexMatcher, testing the
b75a7d8f
A
1211// Replace family of functions.
1212//
1213//---------------------------------------------------------------------------
1214void RegexTest::API_Replace() {
1215 //
1216 // Replace
1217 //
1218 int32_t flags=0;
1219 UParseError pe;
1220 UErrorCode status=U_ZERO_ERROR;
374ca955 1221
b75a7d8f
A
1222 UnicodeString re("abc");
1223 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1224 REGEX_CHECK_STATUS;
1225 UnicodeString data = ".abc..abc...abc..";
1226 // 012345678901234567
1227 RegexMatcher *matcher = pat->matcher(data, status);
374ca955 1228
b75a7d8f
A
1229 //
1230 // Plain vanilla matches.
1231 //
1232 UnicodeString dest;
1233 dest = matcher->replaceFirst("yz", status);
1234 REGEX_CHECK_STATUS;
1235 REGEX_ASSERT(dest == ".yz..abc...abc..");
374ca955 1236
b75a7d8f
A
1237 dest = matcher->replaceAll("yz", status);
1238 REGEX_CHECK_STATUS;
1239 REGEX_ASSERT(dest == ".yz..yz...yz..");
374ca955 1240
b75a7d8f
A
1241 //
1242 // Plain vanilla non-matches.
1243 //
1244 UnicodeString d2 = ".abx..abx...abx..";
1245 matcher->reset(d2);
1246 dest = matcher->replaceFirst("yz", status);
1247 REGEX_CHECK_STATUS;
1248 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 1249
b75a7d8f
A
1250 dest = matcher->replaceAll("yz", status);
1251 REGEX_CHECK_STATUS;
1252 REGEX_ASSERT(dest == ".abx..abx...abx..");
374ca955 1253
b75a7d8f
A
1254 //
1255 // Empty source string
1256 //
1257 UnicodeString d3 = "";
1258 matcher->reset(d3);
1259 dest = matcher->replaceFirst("yz", status);
1260 REGEX_CHECK_STATUS;
1261 REGEX_ASSERT(dest == "");
374ca955 1262
b75a7d8f
A
1263 dest = matcher->replaceAll("yz", status);
1264 REGEX_CHECK_STATUS;
1265 REGEX_ASSERT(dest == "");
374ca955 1266
b75a7d8f
A
1267 //
1268 // Empty substitution string
1269 //
1270 matcher->reset(data); // ".abc..abc...abc.."
1271 dest = matcher->replaceFirst("", status);
1272 REGEX_CHECK_STATUS;
1273 REGEX_ASSERT(dest == "...abc...abc..");
374ca955 1274
b75a7d8f
A
1275 dest = matcher->replaceAll("", status);
1276 REGEX_CHECK_STATUS;
1277 REGEX_ASSERT(dest == "........");
374ca955 1278
b75a7d8f
A
1279 //
1280 // match whole string
1281 //
1282 UnicodeString d4 = "abc";
374ca955 1283 matcher->reset(d4);
b75a7d8f
A
1284 dest = matcher->replaceFirst("xyz", status);
1285 REGEX_CHECK_STATUS;
1286 REGEX_ASSERT(dest == "xyz");
374ca955 1287
b75a7d8f
A
1288 dest = matcher->replaceAll("xyz", status);
1289 REGEX_CHECK_STATUS;
1290 REGEX_ASSERT(dest == "xyz");
374ca955 1291
b75a7d8f
A
1292 //
1293 // Capture Group, simple case
1294 //
1295 UnicodeString re2("a(..)");
1296 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1297 REGEX_CHECK_STATUS;
1298 UnicodeString d5 = "abcdefg";
1299 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1300 REGEX_CHECK_STATUS;
1301 dest = matcher2->replaceFirst("$1$1", status);
1302 REGEX_CHECK_STATUS;
1303 REGEX_ASSERT(dest == "bcbcdefg");
1304
46f4442e 1305 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
b75a7d8f
A
1306 REGEX_CHECK_STATUS;
1307 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1308
1309 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1310 REGEX_CHECK_STATUS;
1311 REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1312
46f4442e 1313 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
b75a7d8f
A
1314 replacement = replacement.unescape();
1315 dest = matcher2->replaceFirst(replacement, status);
1316 REGEX_CHECK_STATUS;
1317 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
374ca955 1318
b75a7d8f 1319 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
374ca955
A
1320
1321
1322 //
1323 // Replacement String with \u hex escapes
1324 //
1325 {
1326 UnicodeString src = "abc 1 abc 2 abc 3";
46f4442e 1327 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
374ca955
A
1328 matcher->reset(src);
1329 UnicodeString result = matcher->replaceAll(substitute, status);
1330 REGEX_CHECK_STATUS;
1331 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1332 }
1333 {
1334 UnicodeString src = "abc !";
46f4442e 1335 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
374ca955
A
1336 matcher->reset(src);
1337 UnicodeString result = matcher->replaceAll(substitute, status);
1338 REGEX_CHECK_STATUS;
1339 UnicodeString expected = UnicodeString("--");
1340 expected.append((UChar32)0x10000);
1341 expected.append("-- !");
1342 REGEX_ASSERT(result == expected);
1343 }
b75a7d8f 1344 // TODO: need more through testing of capture substitutions.
374ca955
A
1345
1346 // Bug 4057
1347 //
1348 {
1349 status = U_ZERO_ERROR;
1350 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1351 RegexMatcher m("ss(.*?)ee", 0, status);
1352 REGEX_CHECK_STATUS;
1353 UnicodeString result;
1354
1355 // Multiple finds do NOT bump up the previous appendReplacement postion.
1356 m.reset(s);
1357 m.find();
1358 m.find();
1359 m.appendReplacement(result, "ooh", status);
1360 REGEX_CHECK_STATUS;
1361 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1362
1363 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1364 status = U_ZERO_ERROR;
1365 result.truncate(0);
1366 m.reset(10, status);
1367 m.find();
1368 m.find();
1369 m.appendReplacement(result, "ooh", status);
1370 REGEX_CHECK_STATUS;
1371 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1372
1373 // find() at interior of string, appendReplacemnt still starts at beginning.
1374 status = U_ZERO_ERROR;
1375 result.truncate(0);
1376 m.reset();
1377 m.find(10, status);
1378 m.find();
1379 m.appendReplacement(result, "ooh", status);
1380 REGEX_CHECK_STATUS;
1381 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1382
1383 m.appendTail(result);
1384 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1385
1386 }
1387
b75a7d8f
A
1388 delete matcher2;
1389 delete pat2;
1390 delete matcher;
1391 delete pat;
1392}
1393
1394
1395//---------------------------------------------------------------------------
1396//
1397// API_Pattern Test that the API for class RegexPattern is
1398// present and nominally working.
1399//
1400//---------------------------------------------------------------------------
1401void RegexTest::API_Pattern() {
1402 RegexPattern pata; // Test default constructor to not crash.
1403 RegexPattern patb;
1404
1405 REGEX_ASSERT(pata == patb);
1406 REGEX_ASSERT(pata == pata);
1407
1408 UnicodeString re1("abc[a-l][m-z]");
1409 UnicodeString re2("def");
1410 UErrorCode status = U_ZERO_ERROR;
1411 UParseError pe;
1412
1413 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1414 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1415 REGEX_CHECK_STATUS;
1416 REGEX_ASSERT(*pat1 == *pat1);
1417 REGEX_ASSERT(*pat1 != pata);
1418
1419 // Assign
1420 patb = *pat1;
1421 REGEX_ASSERT(patb == *pat1);
1422
1423 // Copy Construct
1424 RegexPattern patc(*pat1);
1425 REGEX_ASSERT(patc == *pat1);
1426 REGEX_ASSERT(patb == patc);
1427 REGEX_ASSERT(pat1 != pat2);
1428 patb = *pat2;
1429 REGEX_ASSERT(patb != patc);
1430 REGEX_ASSERT(patb == *pat2);
1431
1432 // Compile with no flags.
1433 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1434 REGEX_ASSERT(*pat1a == *pat1);
1435
1436 REGEX_ASSERT(pat1a->flags() == 0);
374ca955 1437
b75a7d8f
A
1438 // Compile with different flags should be not equal
1439 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1440 REGEX_CHECK_STATUS;
1441
1442 REGEX_ASSERT(*pat1b != *pat1a);
1443 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1444 REGEX_ASSERT(pat1a->flags() == 0);
1445 delete pat1b;
b75a7d8f
A
1446
1447 // clone
1448 RegexPattern *pat1c = pat1->clone();
1449 REGEX_ASSERT(*pat1c == *pat1);
1450 REGEX_ASSERT(*pat1c != *pat2);
1451
b75a7d8f
A
1452 delete pat1c;
1453 delete pat1a;
1454 delete pat1;
1455 delete pat2;
1456
1457
374ca955
A
1458 //
1459 // Verify that a matcher created from a cloned pattern works.
1460 // (Jitterbug 3423)
1461 //
1462 {
1463 UErrorCode status = U_ZERO_ERROR;
46f4442e 1464 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
374ca955
A
1465 RegexPattern *pClone = pSource->clone();
1466 delete pSource;
1467 RegexMatcher *mFromClone = pClone->matcher(status);
1468 REGEX_CHECK_STATUS;
1469 UnicodeString s = "Hello World";
1470 mFromClone->reset(s);
1471 REGEX_ASSERT(mFromClone->find() == TRUE);
1472 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1473 REGEX_ASSERT(mFromClone->find() == TRUE);
1474 REGEX_ASSERT(mFromClone->group(status) == "World");
1475 REGEX_ASSERT(mFromClone->find() == FALSE);
1476 delete mFromClone;
1477 delete pClone;
1478 }
1479
b75a7d8f
A
1480 //
1481 // matches convenience API
1482 //
1483 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1484 REGEX_CHECK_STATUS;
1485 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1486 REGEX_CHECK_STATUS;
1487 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1488 REGEX_CHECK_STATUS;
1489 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1490 REGEX_CHECK_STATUS;
1491 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1492 REGEX_CHECK_STATUS;
1493 status = U_INDEX_OUTOFBOUNDS_ERROR;
1494 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1495 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1496
1497
1498 //
1499 // Split()
1500 //
1501 status = U_ZERO_ERROR;
1502 pat1 = RegexPattern::compile(" +", pe, status);
1503 REGEX_CHECK_STATUS;
1504 UnicodeString fields[10];
1505
1506 int32_t n;
1507 n = pat1->split("Now is the time", fields, 10, status);
1508 REGEX_CHECK_STATUS;
1509 REGEX_ASSERT(n==4);
1510 REGEX_ASSERT(fields[0]=="Now");
1511 REGEX_ASSERT(fields[1]=="is");
1512 REGEX_ASSERT(fields[2]=="the");
1513 REGEX_ASSERT(fields[3]=="time");
1514 REGEX_ASSERT(fields[4]=="");
1515
1516 n = pat1->split("Now is the time", fields, 2, status);
1517 REGEX_CHECK_STATUS;
1518 REGEX_ASSERT(n==2);
1519 REGEX_ASSERT(fields[0]=="Now");
1520 REGEX_ASSERT(fields[1]=="is the time");
1521 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1522
1523 fields[1] = "*";
1524 status = U_ZERO_ERROR;
1525 n = pat1->split("Now is the time", fields, 1, status);
1526 REGEX_CHECK_STATUS;
1527 REGEX_ASSERT(n==1);
1528 REGEX_ASSERT(fields[0]=="Now is the time");
1529 REGEX_ASSERT(fields[1]=="*");
1530 status = U_ZERO_ERROR;
1531
1532 n = pat1->split(" Now is the time ", fields, 10, status);
1533 REGEX_CHECK_STATUS;
1534 REGEX_ASSERT(n==5);
1535 REGEX_ASSERT(fields[0]=="");
1536 REGEX_ASSERT(fields[1]=="Now");
1537 REGEX_ASSERT(fields[2]=="is");
1538 REGEX_ASSERT(fields[3]=="the");
1539 REGEX_ASSERT(fields[4]=="time");
1540 REGEX_ASSERT(fields[5]=="");
1541
1542 n = pat1->split(" ", fields, 10, status);
1543 REGEX_CHECK_STATUS;
1544 REGEX_ASSERT(n==1);
1545 REGEX_ASSERT(fields[0]=="");
1546
1547 fields[0] = "foo";
1548 n = pat1->split("", fields, 10, status);
1549 REGEX_CHECK_STATUS;
1550 REGEX_ASSERT(n==0);
1551 REGEX_ASSERT(fields[0]=="foo");
1552
1553 delete pat1;
1554
1555 // split, with a pattern with (capture)
46f4442e 1556 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
b75a7d8f
A
1557 REGEX_CHECK_STATUS;
1558
1559 status = U_ZERO_ERROR;
1560 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1561 REGEX_CHECK_STATUS;
1562 REGEX_ASSERT(n==6);
1563 REGEX_ASSERT(fields[0]=="");
1564 REGEX_ASSERT(fields[1]=="a");
1565 REGEX_ASSERT(fields[2]=="Now is ");
1566 REGEX_ASSERT(fields[3]=="b");
1567 REGEX_ASSERT(fields[4]=="the time");
1568 REGEX_ASSERT(fields[5]=="c");
1569 REGEX_ASSERT(fields[6]=="");
1570 REGEX_ASSERT(status==U_ZERO_ERROR);
1571
1572 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1573 REGEX_CHECK_STATUS;
1574 REGEX_ASSERT(n==6);
1575 REGEX_ASSERT(fields[0]==" ");
1576 REGEX_ASSERT(fields[1]=="a");
1577 REGEX_ASSERT(fields[2]=="Now is ");
1578 REGEX_ASSERT(fields[3]=="b");
1579 REGEX_ASSERT(fields[4]=="the time");
1580 REGEX_ASSERT(fields[5]=="c");
1581 REGEX_ASSERT(fields[6]=="");
1582
1583 status = U_ZERO_ERROR;
1584 fields[6] = "foo";
1585 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1586 REGEX_CHECK_STATUS;
1587 REGEX_ASSERT(n==6);
1588 REGEX_ASSERT(fields[0]==" ");
1589 REGEX_ASSERT(fields[1]=="a");
1590 REGEX_ASSERT(fields[2]=="Now is ");
1591 REGEX_ASSERT(fields[3]=="b");
1592 REGEX_ASSERT(fields[4]=="the time");
1593 REGEX_ASSERT(fields[5]=="c");
1594 REGEX_ASSERT(fields[6]=="foo");
1595
1596 status = U_ZERO_ERROR;
1597 fields[5] = "foo";
1598 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1599 REGEX_CHECK_STATUS;
1600 REGEX_ASSERT(n==5);
1601 REGEX_ASSERT(fields[0]==" ");
1602 REGEX_ASSERT(fields[1]=="a");
1603 REGEX_ASSERT(fields[2]=="Now is ");
1604 REGEX_ASSERT(fields[3]=="b");
1605 REGEX_ASSERT(fields[4]=="the time<c>");
1606 REGEX_ASSERT(fields[5]=="foo");
1607
1608 status = U_ZERO_ERROR;
1609 fields[5] = "foo";
1610 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1611 REGEX_CHECK_STATUS;
1612 REGEX_ASSERT(n==5);
1613 REGEX_ASSERT(fields[0]==" ");
1614 REGEX_ASSERT(fields[1]=="a");
1615 REGEX_ASSERT(fields[2]=="Now is ");
1616 REGEX_ASSERT(fields[3]=="b");
1617 REGEX_ASSERT(fields[4]=="the time");
1618 REGEX_ASSERT(fields[5]=="foo");
1619
1620 status = U_ZERO_ERROR;
1621 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1622 REGEX_CHECK_STATUS;
1623 REGEX_ASSERT(n==4);
1624 REGEX_ASSERT(fields[0]==" ");
1625 REGEX_ASSERT(fields[1]=="a");
1626 REGEX_ASSERT(fields[2]=="Now is ");
1627 REGEX_ASSERT(fields[3]=="the time<c>");
1628 status = U_ZERO_ERROR;
1629 delete pat1;
1630
1631 pat1 = RegexPattern::compile("([-,])", pe, status);
1632 REGEX_CHECK_STATUS;
1633 n = pat1->split("1-10,20", fields, 10, status);
1634 REGEX_CHECK_STATUS;
1635 REGEX_ASSERT(n==5);
1636 REGEX_ASSERT(fields[0]=="1");
1637 REGEX_ASSERT(fields[1]=="-");
1638 REGEX_ASSERT(fields[2]=="10");
1639 REGEX_ASSERT(fields[3]==",");
1640 REGEX_ASSERT(fields[4]=="20");
1641 delete pat1;
1642
1643
1644 //
1645 // RegexPattern::pattern()
1646 //
1647 pat1 = new RegexPattern();
1648 REGEX_ASSERT(pat1->pattern() == "");
1649 delete pat1;
1650
1651 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1652 REGEX_CHECK_STATUS;
1653 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1654 delete pat1;
1655
1656
1657 //
1658 // classID functions
1659 //
1660 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1661 REGEX_CHECK_STATUS;
1662 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1663 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
374ca955
A
1664 UnicodeString Hello("Hello, world.");
1665 RegexMatcher *m = pat1->matcher(Hello, status);
b75a7d8f
A
1666 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1667 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1668 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1669 delete m;
1670 delete pat1;
1671
1672}
1673
1674//---------------------------------------------------------------------------
1675//
729e4ab9
A
1676// API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1677// is present and working, but excluding functions
1678// implementing replace operations.
b75a7d8f
A
1679//
1680//---------------------------------------------------------------------------
729e4ab9
A
1681void RegexTest::API_Match_UTF8() {
1682 UParseError pe;
1683 UErrorCode status=U_ZERO_ERROR;
1684 int32_t flags = 0;
b75a7d8f
A
1685
1686 //
729e4ab9 1687 // Debug - slide failing test cases early
b75a7d8f 1688 //
729e4ab9
A
1689#if 0
1690 {
374ca955 1691 }
729e4ab9
A
1692 return;
1693#endif
b75a7d8f
A
1694
1695 //
729e4ab9 1696 // Simple pattern compilation
b75a7d8f 1697 //
729e4ab9
A
1698 {
1699 UText re = UTEXT_INITIALIZER;
1700 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1701 RegexPattern *pat2;
1702 pat2 = RegexPattern::compile(&re, flags, pe, status);
1703 REGEX_CHECK_STATUS;
b75a7d8f 1704
729e4ab9
A
1705 UText input1 = UTEXT_INITIALIZER;
1706 UText input2 = UTEXT_INITIALIZER;
1707 UText empty = UTEXT_INITIALIZER;
1708 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1709 REGEX_VERBOSE_TEXT(&input1);
1710 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1711 REGEX_VERBOSE_TEXT(&input2);
1712 utext_openUChars(&empty, NULL, 0, &status);
1713
1714 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1715 int32_t input2Len = strlen("not abc");
b75a7d8f 1716
b75a7d8f 1717
729e4ab9
A
1718 //
1719 // Matcher creation and reset.
1720 //
1721 RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
1722 REGEX_CHECK_STATUS;
1723 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1724 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1725 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1726 m1->reset(&input2);
1727 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1728 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1729 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1730 m1->reset(&input1);
1731 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1732 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1733 m1->reset(&empty);
1734 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1735 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
b75a7d8f 1736
729e4ab9
A
1737 //
1738 // reset(pos, status)
1739 //
1740 m1->reset(&input1);
1741 m1->reset(4, status);
1742 REGEX_CHECK_STATUS;
1743 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1744 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
b75a7d8f 1745
729e4ab9
A
1746 m1->reset(-1, status);
1747 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
b75a7d8f 1748 status = U_ZERO_ERROR;
b75a7d8f 1749
729e4ab9
A
1750 m1->reset(0, status);
1751 REGEX_CHECK_STATUS;
1752 status = U_ZERO_ERROR;
b75a7d8f 1753
729e4ab9
A
1754 m1->reset(input1Len-1, status);
1755 REGEX_CHECK_STATUS;
1756 status = U_ZERO_ERROR;
1757
1758 m1->reset(input1Len, status);
1759 REGEX_CHECK_STATUS;
1760 status = U_ZERO_ERROR;
1761
1762 m1->reset(input1Len+1, status);
1763 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1764 status = U_ZERO_ERROR;
b75a7d8f
A
1765
1766 //
729e4ab9 1767 // match(pos, status)
b75a7d8f 1768 //
729e4ab9
A
1769 m1->reset(&input2);
1770 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1771 m1->reset();
1772 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1773 m1->reset();
1774 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1775 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1776 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1777 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1778
1779 // Match() at end of string should fail, but should not
1780 // be an error.
1781 status = U_ZERO_ERROR;
1782 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1783 REGEX_CHECK_STATUS;
1784
1785 // Match beyond end of string should fail with an error.
1786 status = U_ZERO_ERROR;
1787 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1788 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1789
1790 // Successful match at end of string.
1791 {
1792 status = U_ZERO_ERROR;
1793 RegexMatcher m("A?", 0, status); // will match zero length string.
1794 REGEX_CHECK_STATUS;
1795 m.reset(&input1);
1796 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1797 REGEX_CHECK_STATUS;
1798 m.reset(&empty);
1799 REGEX_ASSERT(m.matches(0, status) == TRUE);
1800 REGEX_CHECK_STATUS;
b75a7d8f
A
1801 }
1802
1803
1804 //
729e4ab9 1805 // lookingAt(pos, status)
b75a7d8f 1806 //
729e4ab9
A
1807 status = U_ZERO_ERROR;
1808 m1->reset(&input2); // "not abc"
1809 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1810 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1811 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1812 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1813 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1814 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1815 status = U_ZERO_ERROR;
1816 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1817 REGEX_CHECK_STATUS;
1818 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1819 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1820
1821 delete m1;
1822 delete pat2;
1823
1824 utext_close(&re);
1825 utext_close(&input1);
1826 utext_close(&input2);
1827 utext_close(&empty);
1828 }
1829
1830
1831 //
1832 // Capture Group.
1833 // RegexMatcher::start();
1834 // RegexMatcher::end();
1835 // RegexMatcher::groupCount();
1836 //
1837 {
1838 int32_t flags=0;
1839 UParseError pe;
1840 UErrorCode status=U_ZERO_ERROR;
1841 UText re=UTEXT_INITIALIZER;
1842 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1843 utext_openUTF8(&re, str_01234567_pat, -1, &status);
1844
1845 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1846 REGEX_CHECK_STATUS;
1847
1848 UText input = UTEXT_INITIALIZER;
1849 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1850 utext_openUTF8(&input, str_0123456789, -1, &status);
1851
1852 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1853 REGEX_CHECK_STATUS;
1854 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1855 static const int32_t matchStarts[] = {0, 2, 4, 8};
1856 static const int32_t matchEnds[] = {10, 8, 6, 10};
1857 int32_t i;
1858 for (i=0; i<4; i++) {
1859 int32_t actualStart = matcher->start(i, status);
1860 REGEX_CHECK_STATUS;
1861 if (actualStart != matchStarts[i]) {
1862 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
1863 __FILE__, __LINE__, i, matchStarts[i], actualStart);
1864 }
1865 int32_t actualEnd = matcher->end(i, status);
1866 REGEX_CHECK_STATUS;
1867 if (actualEnd != matchEnds[i]) {
1868 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
1869 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1870 }
1871 }
1872
1873 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1874 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1875
1876 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1877 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1878 matcher->reset();
1879 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1880
1881 matcher->lookingAt(status);
1882
1883 UnicodeString dest;
1884 UText destText = UTEXT_INITIALIZER;
1885 utext_openUnicodeString(&destText, &dest, &status);
1886 UText *result;
1887 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1888 // Test shallow-clone API
1889 int64_t group_len;
1890 result = matcher->group((UText *)NULL, group_len, status);
1891 REGEX_CHECK_STATUS;
1892 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1893 utext_close(result);
1894 result = matcher->group(0, &destText, group_len, status);
1895 REGEX_CHECK_STATUS;
1896 REGEX_ASSERT(result == &destText);
1897 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1898 // destText is now immutable, reopen it
1899 utext_close(&destText);
1900 utext_openUnicodeString(&destText, &dest, &status);
1901
1902 result = matcher->group(0, NULL, status);
1903 REGEX_CHECK_STATUS;
1904 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1905 utext_close(result);
1906 result = matcher->group(0, &destText, status);
1907 REGEX_CHECK_STATUS;
1908 REGEX_ASSERT(result == &destText);
1909 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1910
1911 result = matcher->group(1, NULL, status);
1912 REGEX_CHECK_STATUS;
1913 const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
1914 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
1915 utext_close(result);
1916 result = matcher->group(1, &destText, status);
1917 REGEX_CHECK_STATUS;
1918 REGEX_ASSERT(result == &destText);
1919 REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
1920
1921 result = matcher->group(2, NULL, status);
1922 REGEX_CHECK_STATUS;
1923 const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
1924 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
1925 utext_close(result);
1926 result = matcher->group(2, &destText, status);
1927 REGEX_CHECK_STATUS;
1928 REGEX_ASSERT(result == &destText);
1929 REGEX_ASSERT_UTEXT_UTF8(str_45, result);
1930
1931 result = matcher->group(3, NULL, status);
1932 REGEX_CHECK_STATUS;
1933 const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
1934 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
1935 utext_close(result);
1936 result = matcher->group(3, &destText, status);
1937 REGEX_CHECK_STATUS;
1938 REGEX_ASSERT(result == &destText);
1939 REGEX_ASSERT_UTEXT_UTF8(str_89, result);
1940
1941 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1942 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1943 matcher->reset();
1944 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
1945
1946 delete matcher;
1947 delete pat;
1948
1949 utext_close(&destText);
1950 utext_close(&input);
1951 utext_close(&re);
1952 }
1953
1954 //
1955 // find
1956 //
1957 {
1958 int32_t flags=0;
1959 UParseError pe;
1960 UErrorCode status=U_ZERO_ERROR;
1961 UText re=UTEXT_INITIALIZER;
1962 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
1963 utext_openUTF8(&re, str_abc, -1, &status);
1964
1965 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1966 REGEX_CHECK_STATUS;
1967 UText input = UTEXT_INITIALIZER;
1968 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
1969 utext_openUTF8(&input, str_abcabcabc, -1, &status);
1970 // 012345678901234567
1971
1972 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1973 REGEX_CHECK_STATUS;
1974 REGEX_ASSERT(matcher->find());
1975 REGEX_ASSERT(matcher->start(status) == 1);
1976 REGEX_ASSERT(matcher->find());
1977 REGEX_ASSERT(matcher->start(status) == 6);
1978 REGEX_ASSERT(matcher->find());
1979 REGEX_ASSERT(matcher->start(status) == 12);
1980 REGEX_ASSERT(matcher->find() == FALSE);
1981 REGEX_ASSERT(matcher->find() == FALSE);
1982
1983 matcher->reset();
1984 REGEX_ASSERT(matcher->find());
1985 REGEX_ASSERT(matcher->start(status) == 1);
1986
1987 REGEX_ASSERT(matcher->find(0, status));
1988 REGEX_ASSERT(matcher->start(status) == 1);
1989 REGEX_ASSERT(matcher->find(1, status));
1990 REGEX_ASSERT(matcher->start(status) == 1);
1991 REGEX_ASSERT(matcher->find(2, status));
1992 REGEX_ASSERT(matcher->start(status) == 6);
1993 REGEX_ASSERT(matcher->find(12, status));
1994 REGEX_ASSERT(matcher->start(status) == 12);
1995 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1996 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1997 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1998 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1999
2000 status = U_ZERO_ERROR;
2001 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2002 status = U_ZERO_ERROR;
2003 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2004
2005 REGEX_ASSERT(matcher->groupCount() == 0);
2006
2007 delete matcher;
2008 delete pat;
2009
2010 utext_close(&input);
2011 utext_close(&re);
2012 }
2013
2014
2015 //
2016 // find, with \G in pattern (true if at the end of a previous match).
2017 //
2018 {
2019 int32_t flags=0;
2020 UParseError pe;
2021 UErrorCode status=U_ZERO_ERROR;
2022 UText re=UTEXT_INITIALIZER;
2023 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2024 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2025
2026 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2027
2028 REGEX_CHECK_STATUS;
2029 UText input = UTEXT_INITIALIZER;
2030 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2031 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2032 // 012345678901234567
2033
2034 RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
2035 REGEX_CHECK_STATUS;
2036 REGEX_ASSERT(matcher->find());
2037 REGEX_ASSERT(matcher->start(status) == 0);
2038 REGEX_ASSERT(matcher->start(1, status) == -1);
2039 REGEX_ASSERT(matcher->start(2, status) == 1);
2040
2041 REGEX_ASSERT(matcher->find());
2042 REGEX_ASSERT(matcher->start(status) == 4);
2043 REGEX_ASSERT(matcher->start(1, status) == 4);
2044 REGEX_ASSERT(matcher->start(2, status) == -1);
2045 REGEX_CHECK_STATUS;
2046
2047 delete matcher;
2048 delete pat;
2049
2050 utext_close(&input);
2051 utext_close(&re);
2052 }
2053
2054 //
2055 // find with zero length matches, match position should bump ahead
2056 // to prevent loops.
2057 //
2058 {
2059 int32_t i;
2060 UErrorCode status=U_ZERO_ERROR;
2061 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2062 // using an always-true look-ahead.
2063 REGEX_CHECK_STATUS;
2064 UText s = UTEXT_INITIALIZER;
2065 utext_openUTF8(&s, " ", -1, &status);
2066 m.reset(&s);
2067 for (i=0; ; i++) {
2068 if (m.find() == FALSE) {
2069 break;
2070 }
2071 REGEX_ASSERT(m.start(status) == i);
2072 REGEX_ASSERT(m.end(status) == i);
2073 }
2074 REGEX_ASSERT(i==5);
2075
2076 // Check that the bump goes over characters outside the BMP OK
2077 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2078 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2079 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2080 m.reset(&s);
2081 for (i=0; ; i+=4) {
2082 if (m.find() == FALSE) {
2083 break;
2084 }
2085 REGEX_ASSERT(m.start(status) == i);
2086 REGEX_ASSERT(m.end(status) == i);
2087 }
2088 REGEX_ASSERT(i==20);
2089
2090 utext_close(&s);
2091 }
2092 {
2093 // find() loop breaking test.
2094 // with pattern of /.?/, should see a series of one char matches, then a single
2095 // match of zero length at the end of the input string.
2096 int32_t i;
2097 UErrorCode status=U_ZERO_ERROR;
2098 RegexMatcher m(".?", 0, status);
2099 REGEX_CHECK_STATUS;
2100 UText s = UTEXT_INITIALIZER;
2101 utext_openUTF8(&s, " ", -1, &status);
2102 m.reset(&s);
2103 for (i=0; ; i++) {
2104 if (m.find() == FALSE) {
2105 break;
2106 }
2107 REGEX_ASSERT(m.start(status) == i);
2108 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2109 }
2110 REGEX_ASSERT(i==5);
2111
2112 utext_close(&s);
2113 }
2114
2115
2116 //
2117 // Matchers with no input string behave as if they had an empty input string.
2118 //
2119
2120 {
2121 UErrorCode status = U_ZERO_ERROR;
2122 RegexMatcher m(".?", 0, status);
2123 REGEX_CHECK_STATUS;
2124 REGEX_ASSERT(m.find());
2125 REGEX_ASSERT(m.start(status) == 0);
2126 REGEX_ASSERT(m.input() == "");
2127 }
2128 {
2129 UErrorCode status = U_ZERO_ERROR;
2130 RegexPattern *p = RegexPattern::compile(".", 0, status);
2131 RegexMatcher *m = p->matcher(status);
2132 REGEX_CHECK_STATUS;
2133
2134 REGEX_ASSERT(m->find() == FALSE);
2135 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2136 delete m;
2137 delete p;
2138 }
2139
2140 //
2141 // Regions
2142 //
2143 {
2144 UErrorCode status = U_ZERO_ERROR;
2145 UText testPattern = UTEXT_INITIALIZER;
2146 UText testText = UTEXT_INITIALIZER;
2147 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2148 REGEX_VERBOSE_TEXT(&testPattern);
2149 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2150 REGEX_VERBOSE_TEXT(&testText);
2151
2152 RegexMatcher m(&testPattern, &testText, 0, status);
2153 REGEX_CHECK_STATUS;
2154 REGEX_ASSERT(m.regionStart() == 0);
2155 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2156 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2157 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2158
2159 m.region(2,4, status);
2160 REGEX_CHECK_STATUS;
2161 REGEX_ASSERT(m.matches(status));
2162 REGEX_ASSERT(m.start(status)==2);
2163 REGEX_ASSERT(m.end(status)==4);
2164 REGEX_CHECK_STATUS;
2165
2166 m.reset();
2167 REGEX_ASSERT(m.regionStart() == 0);
2168 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2169
2170 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2171 REGEX_VERBOSE_TEXT(&testText);
2172 m.reset(&testText);
2173 REGEX_ASSERT(m.regionStart() == 0);
2174 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2175
2176 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2177 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2178 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2179 REGEX_ASSERT(&m == &m.reset());
2180 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2181
2182 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2183 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2184 REGEX_ASSERT(&m == &m.reset());
2185 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2186
2187 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2188 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2189 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2190 REGEX_ASSERT(&m == &m.reset());
2191 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2192
2193 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2194 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2195 REGEX_ASSERT(&m == &m.reset());
2196 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2197
2198 utext_close(&testText);
2199 utext_close(&testPattern);
2200 }
2201
2202 //
2203 // hitEnd() and requireEnd()
2204 //
2205 {
2206 UErrorCode status = U_ZERO_ERROR;
2207 UText testPattern = UTEXT_INITIALIZER;
2208 UText testText = UTEXT_INITIALIZER;
2209 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2210 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2211 utext_openUTF8(&testPattern, str_, -1, &status);
2212 utext_openUTF8(&testText, str_aabb, -1, &status);
2213
2214 RegexMatcher m1(&testPattern, &testText, 0, status);
2215 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2216 REGEX_ASSERT(m1.hitEnd() == TRUE);
2217 REGEX_ASSERT(m1.requireEnd() == FALSE);
2218 REGEX_CHECK_STATUS;
2219
2220 status = U_ZERO_ERROR;
2221 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2222 utext_openUTF8(&testPattern, str_a, -1, &status);
2223 RegexMatcher m2(&testPattern, &testText, 0, status);
2224 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2225 REGEX_ASSERT(m2.hitEnd() == FALSE);
2226 REGEX_ASSERT(m2.requireEnd() == FALSE);
2227 REGEX_CHECK_STATUS;
2228
2229 status = U_ZERO_ERROR;
2230 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2231 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2232 RegexMatcher m3(&testPattern, &testText, 0, status);
2233 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2234 REGEX_ASSERT(m3.hitEnd() == TRUE);
2235 REGEX_ASSERT(m3.requireEnd() == TRUE);
2236 REGEX_CHECK_STATUS;
2237
2238 utext_close(&testText);
2239 utext_close(&testPattern);
2240 }
2241}
2242
2243
2244//---------------------------------------------------------------------------
2245//
2246// API_Replace_UTF8 API test for class RegexMatcher, testing the
2247// Replace family of functions.
2248//
2249//---------------------------------------------------------------------------
2250void RegexTest::API_Replace_UTF8() {
2251 //
2252 // Replace
2253 //
2254 int32_t flags=0;
2255 UParseError pe;
2256 UErrorCode status=U_ZERO_ERROR;
2257
2258 UText re=UTEXT_INITIALIZER;
2259 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2260 REGEX_VERBOSE_TEXT(&re);
2261 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2262 REGEX_CHECK_STATUS;
2263
2264 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2265 // 012345678901234567
2266 UText dataText = UTEXT_INITIALIZER;
2267 utext_openUTF8(&dataText, data, -1, &status);
2268 REGEX_CHECK_STATUS;
2269 REGEX_VERBOSE_TEXT(&dataText);
2270 RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2271
2272 //
2273 // Plain vanilla matches.
2274 //
2275 UnicodeString dest;
2276 UText destText = UTEXT_INITIALIZER;
2277 utext_openUnicodeString(&destText, &dest, &status);
2278 UText *result;
2279
2280 UText replText = UTEXT_INITIALIZER;
2281
2282 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2283 utext_openUTF8(&replText, str_yz, -1, &status);
2284 REGEX_VERBOSE_TEXT(&replText);
2285 result = matcher->replaceFirst(&replText, NULL, status);
2286 REGEX_CHECK_STATUS;
2287 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2288 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2289 utext_close(result);
2290 result = matcher->replaceFirst(&replText, &destText, status);
2291 REGEX_CHECK_STATUS;
2292 REGEX_ASSERT(result == &destText);
2293 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2294
2295 result = matcher->replaceAll(&replText, NULL, status);
2296 REGEX_CHECK_STATUS;
2297 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2298 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2299 utext_close(result);
2300
2301 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2302 result = matcher->replaceAll(&replText, &destText, status);
2303 REGEX_CHECK_STATUS;
2304 REGEX_ASSERT(result == &destText);
2305 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2306
2307 //
2308 // Plain vanilla non-matches.
2309 //
2310 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2311 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2312 matcher->reset(&dataText);
2313
2314 result = matcher->replaceFirst(&replText, NULL, status);
2315 REGEX_CHECK_STATUS;
2316 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2317 utext_close(result);
2318 result = matcher->replaceFirst(&replText, &destText, status);
2319 REGEX_CHECK_STATUS;
2320 REGEX_ASSERT(result == &destText);
2321 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2322
2323 result = matcher->replaceAll(&replText, NULL, status);
2324 REGEX_CHECK_STATUS;
2325 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2326 utext_close(result);
2327 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2328 result = matcher->replaceAll(&replText, &destText, status);
2329 REGEX_CHECK_STATUS;
2330 REGEX_ASSERT(result == &destText);
2331 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2332
2333 //
2334 // Empty source string
2335 //
2336 utext_openUTF8(&dataText, NULL, 0, &status);
2337 matcher->reset(&dataText);
2338
2339 result = matcher->replaceFirst(&replText, NULL, status);
2340 REGEX_CHECK_STATUS;
2341 REGEX_ASSERT_UTEXT_UTF8("", result);
2342 utext_close(result);
2343 result = matcher->replaceFirst(&replText, &destText, status);
2344 REGEX_CHECK_STATUS;
2345 REGEX_ASSERT(result == &destText);
2346 REGEX_ASSERT_UTEXT_UTF8("", result);
2347
2348 result = matcher->replaceAll(&replText, NULL, status);
2349 REGEX_CHECK_STATUS;
2350 REGEX_ASSERT_UTEXT_UTF8("", result);
2351 utext_close(result);
2352 result = matcher->replaceAll(&replText, &destText, status);
2353 REGEX_CHECK_STATUS;
2354 REGEX_ASSERT(result == &destText);
2355 REGEX_ASSERT_UTEXT_UTF8("", result);
2356
2357 //
2358 // Empty substitution string
2359 //
2360 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2361 matcher->reset(&dataText);
2362
2363 utext_openUTF8(&replText, NULL, 0, &status);
2364 result = matcher->replaceFirst(&replText, NULL, status);
2365 REGEX_CHECK_STATUS;
2366 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2367 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2368 utext_close(result);
2369 result = matcher->replaceFirst(&replText, &destText, status);
2370 REGEX_CHECK_STATUS;
2371 REGEX_ASSERT(result == &destText);
2372 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2373
2374 result = matcher->replaceAll(&replText, NULL, status);
2375 REGEX_CHECK_STATUS;
2376 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2377 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2378 utext_close(result);
2379 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2380 result = matcher->replaceAll(&replText, &destText, status);
2381 REGEX_CHECK_STATUS;
2382 REGEX_ASSERT(result == &destText);
2383 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2384
2385 //
2386 // match whole string
2387 //
2388 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2389 utext_openUTF8(&dataText, str_abc, -1, &status);
2390 matcher->reset(&dataText);
2391
2392 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2393 utext_openUTF8(&replText, str_xyz, -1, &status);
2394 result = matcher->replaceFirst(&replText, NULL, status);
2395 REGEX_CHECK_STATUS;
2396 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2397 utext_close(result);
2398 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2399 result = matcher->replaceFirst(&replText, &destText, status);
2400 REGEX_CHECK_STATUS;
2401 REGEX_ASSERT(result == &destText);
2402 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2403
2404 result = matcher->replaceAll(&replText, NULL, status);
2405 REGEX_CHECK_STATUS;
2406 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2407 utext_close(result);
2408 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2409 result = matcher->replaceAll(&replText, &destText, status);
2410 REGEX_CHECK_STATUS;
2411 REGEX_ASSERT(result == &destText);
2412 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2413
2414 //
2415 // Capture Group, simple case
2416 //
2417 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2418 utext_openUTF8(&re, str_add, -1, &status);
2419 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2420 REGEX_CHECK_STATUS;
2421
2422 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2423 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2424 RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2425 REGEX_CHECK_STATUS;
2426
2427 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2428 utext_openUTF8(&replText, str_11, -1, &status);
2429 result = matcher2->replaceFirst(&replText, NULL, status);
2430 REGEX_CHECK_STATUS;
2431 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2432 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2433 utext_close(result);
2434 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2435 result = matcher2->replaceFirst(&replText, &destText, status);
2436 REGEX_CHECK_STATUS;
2437 REGEX_ASSERT(result == &destText);
2438 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2439
2440 regextst_openUTF8FromInvariant(&replText, "The value of \\$1 is $1.", -1, &status);
2441 result = matcher2->replaceFirst(&replText, NULL, status);
2442 REGEX_CHECK_STATUS;
2443 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2444 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2445 utext_close(result);
2446 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2447 result = matcher2->replaceFirst(&replText, &destText, status);
2448 REGEX_CHECK_STATUS;
2449 REGEX_ASSERT(result == &destText);
2450 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2451
2452 const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2453 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2454 result = matcher2->replaceFirst(&replText, NULL, status);
2455 REGEX_CHECK_STATUS;
2456 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2457 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2458 utext_close(result);
2459 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2460 result = matcher2->replaceFirst(&replText, &destText, status);
2461 REGEX_CHECK_STATUS;
2462 REGEX_ASSERT(result == &destText);
2463 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2464
2465 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2466 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2467 // 012345678901234567890123456
2468 supplDigitChars[22] = 0xF0;
2469 supplDigitChars[23] = 0x9D;
2470 supplDigitChars[24] = 0x9F;
2471 supplDigitChars[25] = 0x8F;
2472 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2473
2474 result = matcher2->replaceFirst(&replText, NULL, status);
2475 REGEX_CHECK_STATUS;
2476 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2477 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2478 utext_close(result);
2479 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2480 result = matcher2->replaceFirst(&replText, &destText, status);
2481 REGEX_CHECK_STATUS;
2482 REGEX_ASSERT(result == &destText);
2483 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2484 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2485 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2486 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2487// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2488 utext_close(result);
2489 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2490 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2491 REGEX_ASSERT(result == &destText);
2492// REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2493
2494 //
2495 // Replacement String with \u hex escapes
2496 //
2497 {
2498 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2499 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2500 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2501 utext_openUTF8(&replText, str_u0043, -1, &status);
2502 matcher->reset(&dataText);
2503
2504 result = matcher->replaceAll(&replText, NULL, status);
2505 REGEX_CHECK_STATUS;
2506 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2507 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2508 utext_close(result);
2509 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2510 result = matcher->replaceAll(&replText, &destText, status);
2511 REGEX_CHECK_STATUS;
2512 REGEX_ASSERT(result == &destText);
2513 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2514 }
2515 {
2516 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2517 utext_openUTF8(&dataText, str_abc, -1, &status);
2518 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2519 utext_openUTF8(&replText, str_U00010000, -1, &status);
2520 matcher->reset(&dataText);
2521
2522 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2523 // 0123456789
2524 expected[2] = 0xF0;
2525 expected[3] = 0x90;
2526 expected[4] = 0x80;
2527 expected[5] = 0x80;
2528
2529 result = matcher->replaceAll(&replText, NULL, status);
2530 REGEX_CHECK_STATUS;
2531 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2532 utext_close(result);
2533 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2534 result = matcher->replaceAll(&replText, &destText, status);
2535 REGEX_CHECK_STATUS;
2536 REGEX_ASSERT(result == &destText);
2537 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2538 }
2539 // TODO: need more through testing of capture substitutions.
2540
2541 // Bug 4057
2542 //
2543 {
2544 status = U_ZERO_ERROR;
2545const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2546const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2547const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2548 utext_openUTF8(&re, str_ssee, -1, &status);
2549 utext_openUTF8(&dataText, str_blah, -1, &status);
2550 utext_openUTF8(&replText, str_ooh, -1, &status);
2551
2552 RegexMatcher m(&re, 0, status);
2553 REGEX_CHECK_STATUS;
2554
2555 UnicodeString result;
2556 UText resultText = UTEXT_INITIALIZER;
2557 utext_openUnicodeString(&resultText, &result, &status);
2558
2559 // Multiple finds do NOT bump up the previous appendReplacement postion.
2560 m.reset(&dataText);
2561 m.find();
2562 m.find();
2563 m.appendReplacement(&resultText, &replText, status);
2564 REGEX_CHECK_STATUS;
2565 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2566 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2567
2568 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2569 status = U_ZERO_ERROR;
2570 result.truncate(0);
2571 utext_openUnicodeString(&resultText, &result, &status);
2572 m.reset(10, status);
2573 m.find();
2574 m.find();
2575 m.appendReplacement(&resultText, &replText, status);
2576 REGEX_CHECK_STATUS;
2577 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2578 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2579
2580 // find() at interior of string, appendReplacement still starts at beginning.
2581 status = U_ZERO_ERROR;
2582 result.truncate(0);
2583 utext_openUnicodeString(&resultText, &result, &status);
2584 m.reset();
2585 m.find(10, status);
2586 m.find();
2587 m.appendReplacement(&resultText, &replText, status);
2588 REGEX_CHECK_STATUS;
2589 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2590 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2591
2592 m.appendTail(&resultText, status);
2593 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2594 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2595
2596 utext_close(&resultText);
2597 }
2598
2599 delete matcher2;
2600 delete pat2;
2601 delete matcher;
2602 delete pat;
2603
2604 utext_close(&dataText);
2605 utext_close(&replText);
2606 utext_close(&destText);
2607 utext_close(&re);
2608}
2609
2610
2611//---------------------------------------------------------------------------
2612//
2613// API_Pattern_UTF8 Test that the API for class RegexPattern is
2614// present and nominally working.
2615//
2616//---------------------------------------------------------------------------
2617void RegexTest::API_Pattern_UTF8() {
2618 RegexPattern pata; // Test default constructor to not crash.
2619 RegexPattern patb;
2620
2621 REGEX_ASSERT(pata == patb);
2622 REGEX_ASSERT(pata == pata);
2623
2624 UText re1 = UTEXT_INITIALIZER;
2625 UText re2 = UTEXT_INITIALIZER;
2626 UErrorCode status = U_ZERO_ERROR;
2627 UParseError pe;
2628
2629 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2630 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2631 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2632 utext_openUTF8(&re2, str_def, -1, &status);
2633
2634 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2635 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2636 REGEX_CHECK_STATUS;
2637 REGEX_ASSERT(*pat1 == *pat1);
2638 REGEX_ASSERT(*pat1 != pata);
2639
2640 // Assign
2641 patb = *pat1;
2642 REGEX_ASSERT(patb == *pat1);
2643
2644 // Copy Construct
2645 RegexPattern patc(*pat1);
2646 REGEX_ASSERT(patc == *pat1);
2647 REGEX_ASSERT(patb == patc);
2648 REGEX_ASSERT(pat1 != pat2);
2649 patb = *pat2;
2650 REGEX_ASSERT(patb != patc);
2651 REGEX_ASSERT(patb == *pat2);
2652
2653 // Compile with no flags.
2654 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2655 REGEX_ASSERT(*pat1a == *pat1);
2656
2657 REGEX_ASSERT(pat1a->flags() == 0);
2658
2659 // Compile with different flags should be not equal
2660 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2661 REGEX_CHECK_STATUS;
2662
2663 REGEX_ASSERT(*pat1b != *pat1a);
2664 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2665 REGEX_ASSERT(pat1a->flags() == 0);
2666 delete pat1b;
2667
2668 // clone
2669 RegexPattern *pat1c = pat1->clone();
2670 REGEX_ASSERT(*pat1c == *pat1);
2671 REGEX_ASSERT(*pat1c != *pat2);
2672
2673 delete pat1c;
2674 delete pat1a;
2675 delete pat1;
2676 delete pat2;
2677
2678 utext_close(&re1);
2679 utext_close(&re2);
2680
2681
2682 //
2683 // Verify that a matcher created from a cloned pattern works.
2684 // (Jitterbug 3423)
2685 //
2686 {
2687 UErrorCode status = U_ZERO_ERROR;
2688 UText pattern = UTEXT_INITIALIZER;
2689 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2690 utext_openUTF8(&pattern, str_pL, -1, &status);
2691
2692 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2693 RegexPattern *pClone = pSource->clone();
2694 delete pSource;
2695 RegexMatcher *mFromClone = pClone->matcher(status);
2696 REGEX_CHECK_STATUS;
2697
2698 UText input = UTEXT_INITIALIZER;
2699 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2700 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2701 mFromClone->reset(&input);
2702 REGEX_ASSERT(mFromClone->find() == TRUE);
2703 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2704 REGEX_ASSERT(mFromClone->find() == TRUE);
2705 REGEX_ASSERT(mFromClone->group(status) == "World");
2706 REGEX_ASSERT(mFromClone->find() == FALSE);
2707 delete mFromClone;
2708 delete pClone;
2709
2710 utext_close(&input);
2711 utext_close(&pattern);
2712 }
2713
2714 //
2715 // matches convenience API
2716 //
2717 {
2718 UErrorCode status = U_ZERO_ERROR;
2719 UText pattern = UTEXT_INITIALIZER;
2720 UText input = UTEXT_INITIALIZER;
2721
2722 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2723 utext_openUTF8(&input, str_randominput, -1, &status);
2724
2725 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2726 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2727 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2728 REGEX_CHECK_STATUS;
2729
2730 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2731 utext_openUTF8(&pattern, str_abc, -1, &status);
2732 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2733 REGEX_CHECK_STATUS;
2734
2735 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2736 utext_openUTF8(&pattern, str_nput, -1, &status);
2737 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2738 REGEX_CHECK_STATUS;
2739
2740 utext_openUTF8(&pattern, str_randominput, -1, &status);
2741 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2742 REGEX_CHECK_STATUS;
2743
2744 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2745 utext_openUTF8(&pattern, str_u, -1, &status);
2746 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2747 REGEX_CHECK_STATUS;
2748
2749 utext_openUTF8(&input, str_abc, -1, &status);
2750 utext_openUTF8(&pattern, str_abc, -1, &status);
2751 status = U_INDEX_OUTOFBOUNDS_ERROR;
2752 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2753 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2754
2755 utext_close(&input);
2756 utext_close(&pattern);
2757 }
2758
2759
2760 //
2761 // Split()
2762 //
2763 status = U_ZERO_ERROR;
2764 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2765 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2766 pat1 = RegexPattern::compile(&re1, pe, status);
2767 REGEX_CHECK_STATUS;
2768 UnicodeString fields[10];
2769
2770 int32_t n;
2771 n = pat1->split("Now is the time", fields, 10, status);
2772 REGEX_CHECK_STATUS;
2773 REGEX_ASSERT(n==4);
2774 REGEX_ASSERT(fields[0]=="Now");
2775 REGEX_ASSERT(fields[1]=="is");
2776 REGEX_ASSERT(fields[2]=="the");
2777 REGEX_ASSERT(fields[3]=="time");
2778 REGEX_ASSERT(fields[4]=="");
2779
2780 n = pat1->split("Now is the time", fields, 2, status);
2781 REGEX_CHECK_STATUS;
2782 REGEX_ASSERT(n==2);
2783 REGEX_ASSERT(fields[0]=="Now");
2784 REGEX_ASSERT(fields[1]=="is the time");
2785 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2786
2787 fields[1] = "*";
2788 status = U_ZERO_ERROR;
2789 n = pat1->split("Now is the time", fields, 1, status);
2790 REGEX_CHECK_STATUS;
2791 REGEX_ASSERT(n==1);
2792 REGEX_ASSERT(fields[0]=="Now is the time");
2793 REGEX_ASSERT(fields[1]=="*");
2794 status = U_ZERO_ERROR;
2795
2796 n = pat1->split(" Now is the time ", fields, 10, status);
2797 REGEX_CHECK_STATUS;
2798 REGEX_ASSERT(n==5);
2799 REGEX_ASSERT(fields[0]=="");
2800 REGEX_ASSERT(fields[1]=="Now");
2801 REGEX_ASSERT(fields[2]=="is");
2802 REGEX_ASSERT(fields[3]=="the");
2803 REGEX_ASSERT(fields[4]=="time");
2804 REGEX_ASSERT(fields[5]=="");
2805
2806 n = pat1->split(" ", fields, 10, status);
2807 REGEX_CHECK_STATUS;
2808 REGEX_ASSERT(n==1);
2809 REGEX_ASSERT(fields[0]=="");
2810
2811 fields[0] = "foo";
2812 n = pat1->split("", fields, 10, status);
2813 REGEX_CHECK_STATUS;
2814 REGEX_ASSERT(n==0);
2815 REGEX_ASSERT(fields[0]=="foo");
2816
2817 delete pat1;
2818
2819 // split, with a pattern with (capture)
2820 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2821 pat1 = RegexPattern::compile(&re1, pe, status);
2822 REGEX_CHECK_STATUS;
2823
2824 status = U_ZERO_ERROR;
2825 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2826 REGEX_CHECK_STATUS;
2827 REGEX_ASSERT(n==6);
2828 REGEX_ASSERT(fields[0]=="");
2829 REGEX_ASSERT(fields[1]=="a");
2830 REGEX_ASSERT(fields[2]=="Now is ");
2831 REGEX_ASSERT(fields[3]=="b");
2832 REGEX_ASSERT(fields[4]=="the time");
2833 REGEX_ASSERT(fields[5]=="c");
2834 REGEX_ASSERT(fields[6]=="");
2835 REGEX_ASSERT(status==U_ZERO_ERROR);
2836
2837 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
2838 REGEX_CHECK_STATUS;
2839 REGEX_ASSERT(n==6);
2840 REGEX_ASSERT(fields[0]==" ");
2841 REGEX_ASSERT(fields[1]=="a");
2842 REGEX_ASSERT(fields[2]=="Now is ");
2843 REGEX_ASSERT(fields[3]=="b");
2844 REGEX_ASSERT(fields[4]=="the time");
2845 REGEX_ASSERT(fields[5]=="c");
2846 REGEX_ASSERT(fields[6]=="");
2847
2848 status = U_ZERO_ERROR;
2849 fields[6] = "foo";
2850 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
2851 REGEX_CHECK_STATUS;
2852 REGEX_ASSERT(n==6);
2853 REGEX_ASSERT(fields[0]==" ");
2854 REGEX_ASSERT(fields[1]=="a");
2855 REGEX_ASSERT(fields[2]=="Now is ");
2856 REGEX_ASSERT(fields[3]=="b");
2857 REGEX_ASSERT(fields[4]=="the time");
2858 REGEX_ASSERT(fields[5]=="c");
2859 REGEX_ASSERT(fields[6]=="foo");
2860
2861 status = U_ZERO_ERROR;
2862 fields[5] = "foo";
2863 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
2864 REGEX_CHECK_STATUS;
2865 REGEX_ASSERT(n==5);
2866 REGEX_ASSERT(fields[0]==" ");
2867 REGEX_ASSERT(fields[1]=="a");
2868 REGEX_ASSERT(fields[2]=="Now is ");
2869 REGEX_ASSERT(fields[3]=="b");
2870 REGEX_ASSERT(fields[4]=="the time<c>");
2871 REGEX_ASSERT(fields[5]=="foo");
2872
2873 status = U_ZERO_ERROR;
2874 fields[5] = "foo";
2875 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
2876 REGEX_CHECK_STATUS;
2877 REGEX_ASSERT(n==5);
2878 REGEX_ASSERT(fields[0]==" ");
2879 REGEX_ASSERT(fields[1]=="a");
2880 REGEX_ASSERT(fields[2]=="Now is ");
2881 REGEX_ASSERT(fields[3]=="b");
2882 REGEX_ASSERT(fields[4]=="the time");
2883 REGEX_ASSERT(fields[5]=="foo");
2884
2885 status = U_ZERO_ERROR;
2886 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
2887 REGEX_CHECK_STATUS;
2888 REGEX_ASSERT(n==4);
2889 REGEX_ASSERT(fields[0]==" ");
2890 REGEX_ASSERT(fields[1]=="a");
2891 REGEX_ASSERT(fields[2]=="Now is ");
2892 REGEX_ASSERT(fields[3]=="the time<c>");
2893 status = U_ZERO_ERROR;
2894 delete pat1;
2895
2896 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
2897 pat1 = RegexPattern::compile(&re1, pe, status);
2898 REGEX_CHECK_STATUS;
2899 n = pat1->split("1-10,20", fields, 10, status);
2900 REGEX_CHECK_STATUS;
2901 REGEX_ASSERT(n==5);
2902 REGEX_ASSERT(fields[0]=="1");
2903 REGEX_ASSERT(fields[1]=="-");
2904 REGEX_ASSERT(fields[2]=="10");
2905 REGEX_ASSERT(fields[3]==",");
2906 REGEX_ASSERT(fields[4]=="20");
2907 delete pat1;
2908
2909
2910 //
2911 // RegexPattern::pattern() and patternText()
2912 //
2913 pat1 = new RegexPattern();
2914 REGEX_ASSERT(pat1->pattern() == "");
2915 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
2916 delete pat1;
2917
2918 regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status);
2919 pat1 = RegexPattern::compile(&re1, pe, status);
2920 REGEX_CHECK_STATUS;
2921 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
2922 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
2923 delete pat1;
2924
2925 utext_close(&re1);
2926}
2927
2928
2929//---------------------------------------------------------------------------
2930//
2931// Extended A more thorough check for features of regex patterns
2932// The test cases are in a separate data file,
2933// source/tests/testdata/regextst.txt
2934// A description of the test data format is included in that file.
2935//
2936//---------------------------------------------------------------------------
2937
2938const char *
2939RegexTest::getPath(char buffer[2048], const char *filename) {
2940 UErrorCode status=U_ZERO_ERROR;
2941 const char *testDataDirectory = IntlTest::getSourceTestData(status);
2942 if (U_FAILURE(status)) {
2943 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
2944 return NULL;
2945 }
2946
2947 strcpy(buffer, testDataDirectory);
2948 strcat(buffer, filename);
2949 return buffer;
2950}
2951
2952void RegexTest::Extended() {
2953 char tdd[2048];
2954 const char *srcPath;
2955 UErrorCode status = U_ZERO_ERROR;
2956 int32_t lineNum = 0;
2957
2958 //
2959 // Open and read the test data file.
2960 //
2961 srcPath=getPath(tdd, "regextst.txt");
2962 if(srcPath==NULL) {
2963 return; /* something went wrong, error already output */
2964 }
2965
2966 int32_t len;
2967 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
2968 if (U_FAILURE(status)) {
2969 return; /* something went wrong, error already output */
2970 }
2971
2972 //
2973 // Put the test data into a UnicodeString
2974 //
2975 UnicodeString testString(FALSE, testData, len);
2976
2977 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
2978 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
2979 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
2980
2981 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
2982 UnicodeString testPattern; // The pattern for test from the test file.
2983 UnicodeString testFlags; // the flags for a test.
2984 UnicodeString matchString; // The marked up string to be used as input
2985
2986 if (U_FAILURE(status)){
2987 dataerrln("Construct RegexMatcher() error.");
2988 delete [] testData;
2989 return;
2990 }
2991
2992 //
2993 // Loop over the test data file, once per line.
2994 //
2995 while (lineMat.find()) {
2996 lineNum++;
2997 if (U_FAILURE(status)) {
2998 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
2999 }
3000
3001 status = U_ZERO_ERROR;
3002 UnicodeString testLine = lineMat.group(1, status);
3003 if (testLine.length() == 0) {
3004 continue;
3005 }
3006
3007 //
3008 // Parse the test line. Skip blank and comment only lines.
3009 // Separate out the three main fields - pattern, flags, target.
3010 //
3011
3012 commentMat.reset(testLine);
3013 if (commentMat.lookingAt(status)) {
3014 // This line is a comment, or blank.
3015 continue;
3016 }
3017
3018 //
3019 // Pull out the pattern field, remove it from the test file line.
3020 //
3021 quotedStuffMat.reset(testLine);
3022 if (quotedStuffMat.lookingAt(status)) {
3023 testPattern = quotedStuffMat.group(2, status);
3024 testLine.remove(0, quotedStuffMat.end(0, status));
3025 } else {
3026 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3027 continue;
3028 }
3029
3030
3031 //
3032 // Pull out the flags from the test file line.
3033 //
3034 flagsMat.reset(testLine);
3035 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3036 testFlags = flagsMat.group(1, status);
3037 if (flagsMat.group(2, status).length() > 0) {
3038 errln("Bad Match flag at line %d. Scanning %c\n",
b75a7d8f
A
3039 lineNum, flagsMat.group(2, status).charAt(0));
3040 continue;
3041 }
729e4ab9
A
3042 testLine.remove(0, flagsMat.end(0, status));
3043
3044 //
3045 // Pull out the match string, as a whole.
3046 // We'll process the <tags> later.
3047 //
3048 quotedStuffMat.reset(testLine);
3049 if (quotedStuffMat.lookingAt(status)) {
3050 matchString = quotedStuffMat.group(2, status);
3051 testLine.remove(0, quotedStuffMat.end(0, status));
3052 } else {
3053 errln("Bad match string at test file line %d", lineNum);
3054 continue;
3055 }
3056
3057 //
3058 // The only thing left from the input line should be an optional trailing comment.
3059 //
3060 commentMat.reset(testLine);
3061 if (commentMat.lookingAt(status) == FALSE) {
3062 errln("Line %d: unexpected characters at end of test line.", lineNum);
3063 continue;
3064 }
3065
3066 //
3067 // Run the test
3068 //
3069 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3070 }
3071
3072 delete [] testData;
3073
3074}
3075
3076
3077
3078//---------------------------------------------------------------------------
3079//
3080// regex_find(pattern, flags, inputString, lineNumber)
3081//
3082// Function to run a single test from the Extended (data driven) tests.
3083// See file test/testdata/regextst.txt for a description of the
3084// pattern and inputString fields, and the allowed flags.
3085// lineNumber is the source line in regextst.txt of the test.
3086//
3087//---------------------------------------------------------------------------
3088
3089
3090// Set a value into a UVector at position specified by a decimal number in
3091// a UnicodeString. This is a utility function needed by the actual test function,
3092// which follows.
3093static void set(UVector &vec, int32_t val, UnicodeString index) {
3094 UErrorCode status=U_ZERO_ERROR;
3095 int32_t idx = 0;
3096 for (int32_t i=0; i<index.length(); i++) {
3097 int32_t d=u_charDigitValue(index.charAt(i));
3098 if (d<0) {return;}
3099 idx = idx*10 + d;
3100 }
3101 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3102 vec.setElementAt(val, idx);
3103}
3104
3105static void setInt(UVector &vec, int32_t val, int32_t idx) {
3106 UErrorCode status=U_ZERO_ERROR;
3107 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3108 vec.setElementAt(val, idx);
3109}
3110
3111static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3112{
3113 UBool couldFind = TRUE;
3114 UTEXT_SETNATIVEINDEX(utext, 0);
3115 int32_t i = 0;
3116 while (i < unistrOffset) {
3117 UChar32 c = UTEXT_NEXT32(utext);
3118 if (c != U_SENTINEL) {
3119 i += U16_LENGTH(c);
3120 } else {
3121 couldFind = FALSE;
3122 break;
3123 }
3124 }
3125 nativeIndex = UTEXT_GETNATIVEINDEX(utext);
3126 return couldFind;
3127}
3128
3129
3130void RegexTest::regex_find(const UnicodeString &pattern,
3131 const UnicodeString &flags,
3132 const UnicodeString &inputString,
3133 const char *srcPath,
3134 int32_t line) {
3135 UnicodeString unEscapedInput;
3136 UnicodeString deTaggedInput;
3137
3138 int32_t patternUTF8Length, inputUTF8Length;
3139 char *patternChars = NULL, *inputChars = NULL;
3140 UText patternText = UTEXT_INITIALIZER;
3141 UText inputText = UTEXT_INITIALIZER;
3142 UConverter *UTF8Converter = NULL;
3143
3144 UErrorCode status = U_ZERO_ERROR;
3145 UParseError pe;
3146 RegexPattern *parsePat = NULL;
3147 RegexMatcher *parseMatcher = NULL;
3148 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3149 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3150 UVector groupStarts(status);
3151 UVector groupEnds(status);
3152 UVector groupStartsUTF8(status);
3153 UVector groupEndsUTF8(status);
3154 UBool isMatch = FALSE, isUTF8Match = FALSE;
3155 UBool failed = FALSE;
3156 int32_t numFinds;
3157 int32_t i;
3158 UBool useMatchesFunc = FALSE;
3159 UBool useLookingAtFunc = FALSE;
3160 int32_t regionStart = -1;
3161 int32_t regionEnd = -1;
3162 int32_t regionStartUTF8 = -1;
3163 int32_t regionEndUTF8 = -1;
3164
3165
3166 //
3167 // Compile the caller's pattern
3168 //
3169 uint32_t bflags = 0;
3170 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3171 bflags |= UREGEX_CASE_INSENSITIVE;
3172 }
3173 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3174 bflags |= UREGEX_COMMENTS;
3175 }
3176 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3177 bflags |= UREGEX_DOTALL;
3178 }
3179 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3180 bflags |= UREGEX_MULTILINE;
3181 }
3182
3183 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3184 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3185 }
3186 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3187 bflags |= UREGEX_UNIX_LINES;
3188 }
3189
3190
3191 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3192 if (status != U_ZERO_ERROR) {
3193 #if UCONFIG_NO_BREAK_ITERATION==1
3194 // 'v' test flag means that the test pattern should not compile if ICU was configured
3195 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3196 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3197 goto cleanupAndReturn;
3198 }
3199 #endif
3200 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3201 // Expected pattern compilation error.
3202 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3203 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3204 }
3205 goto cleanupAndReturn;
3206 } else {
3207 // Unexpected pattern compilation error.
3208 errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3209 goto cleanupAndReturn;
3210 }
3211 }
3212
3213 UTF8Converter = ucnv_open("UTF8", &status);
3214 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3215
3216 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3217 status = U_ZERO_ERROR; // buffer overflow
3218 patternChars = new char[patternUTF8Length+1];
3219 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3220 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3221
3222 if (status == U_ZERO_ERROR) {
3223 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3224
3225 if (status != U_ZERO_ERROR) {
3226#if UCONFIG_NO_BREAK_ITERATION==1
3227 // 'v' test flag means that the test pattern should not compile if ICU was configured
3228 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3229 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3230 goto cleanupAndReturn;
3231 }
3232#endif
3233 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3234 // Expected pattern compilation error.
3235 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3236 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3237 }
3238 goto cleanupAndReturn;
3239 } else {
3240 // Unexpected pattern compilation error.
3241 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3242 goto cleanupAndReturn;
3243 }
3244 }
3245 }
3246
3247 if (UTF8Pattern == NULL) {
3248 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3249 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3250 status = U_ZERO_ERROR;
3251 }
3252
3253 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3254 RegexPatternDump(callerPattern);
3255 }
3256
3257 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3258 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3259 goto cleanupAndReturn;
3260 }
3261
3262
3263 //
3264 // Number of times find() should be called on the test string, default to 1
3265 //
3266 numFinds = 1;
3267 for (i=2; i<=9; i++) {
3268 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3269 if (numFinds != 1) {
3270 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3271 goto cleanupAndReturn;
3272 }
3273 numFinds = i;
3274 }
3275 }
3276
3277 // 'M' flag. Use matches() instead of find()
3278 if (flags.indexOf((UChar)0x4d) >= 0) {
3279 useMatchesFunc = TRUE;
3280 }
3281 if (flags.indexOf((UChar)0x4c) >= 0) {
3282 useLookingAtFunc = TRUE;
3283 }
3284
3285 //
3286 // Find the tags in the input data, remove them, and record the group boundary
3287 // positions.
3288 //
3289 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3290 REGEX_CHECK_STATUS_L(line);
3291
3292 unEscapedInput = inputString.unescape();
3293 parseMatcher = parsePat->matcher(unEscapedInput, status);
3294 REGEX_CHECK_STATUS_L(line);
3295 while(parseMatcher->find()) {
3296 parseMatcher->appendReplacement(deTaggedInput, "", status);
3297 REGEX_CHECK_STATUS;
3298 UnicodeString groupNum = parseMatcher->group(2, status);
3299 if (groupNum == "r") {
3300 // <r> or </r>, a region specification within the string
3301 if (parseMatcher->group(1, status) == "/") {
3302 regionEnd = deTaggedInput.length();
3303 } else {
3304 regionStart = deTaggedInput.length();
3305 }
3306 } else {
3307 // <digits> or </digits>, a group match boundary tag.
3308 if (parseMatcher->group(1, status) == "/") {
3309 set(groupEnds, deTaggedInput.length(), groupNum);
3310 } else {
3311 set(groupStarts, deTaggedInput.length(), groupNum);
3312 }
3313 }
3314 }
3315 parseMatcher->appendTail(deTaggedInput);
3316 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3317 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3318 errln("mismatched <r> tags");
3319 failed = TRUE;
3320 goto cleanupAndReturn;
3321 }
b75a7d8f 3322
729e4ab9
A
3323 //
3324 // Configure the matcher according to the flags specified with this test.
3325 //
3326 matcher = callerPattern->matcher(deTaggedInput, status);
3327 REGEX_CHECK_STATUS_L(line);
3328 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3329 matcher->setTrace(TRUE);
3330 }
3331
3332 if (UTF8Pattern != NULL) {
3333 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3334 status = U_ZERO_ERROR; // buffer overflow
3335 inputChars = new char[inputUTF8Length+1];
3336 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3337 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3338
3339 if (status == U_ZERO_ERROR) {
3340 UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
3341 REGEX_CHECK_STATUS_L(line);
3342 }
3343
3344 if (UTF8Matcher == NULL) {
3345 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3346 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3347 status = U_ZERO_ERROR;
3348 }
3349 }
3350
3351 //
3352 // Generate native indices for UTF8 versions of region and capture group info
3353 //
3354 if (UTF8Matcher != NULL) {
3355 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3356 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3357
3358 // Fill out the native index UVector info.
3359 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3360 for (i=0; i<groupStarts.size(); i++) {
3361 int32_t start = groupStarts.elementAti(i);
3362 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3363 if (start >= 0) {
3364 int32_t startUTF8;
3365 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3366 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3367 failed = TRUE;
3368 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3369 }
3370 setInt(groupStartsUTF8, startUTF8, i);
3371 }
3372
3373 int32_t end = groupEnds.elementAti(i);
3374 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3375 if (end >= 0) {
3376 int32_t endUTF8;
3377 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3378 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3379 failed = TRUE;
3380 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3381 }
3382 setInt(groupEndsUTF8, endUTF8, i);
3383 }
3384 }
3385 }
3386
3387 if (regionStart>=0) {
3388 matcher->region(regionStart, regionEnd, status);
3389 REGEX_CHECK_STATUS_L(line);
3390 if (UTF8Matcher != NULL) {
3391 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3392 REGEX_CHECK_STATUS_L(line);
3393 }
3394 }
3395 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3396 matcher->useAnchoringBounds(FALSE);
3397 if (UTF8Matcher != NULL) {
3398 UTF8Matcher->useAnchoringBounds(FALSE);
3399 }
3400 }
3401 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3402 matcher->useTransparentBounds(TRUE);
3403 if (UTF8Matcher != NULL) {
3404 UTF8Matcher->useTransparentBounds(TRUE);
3405 }
3406 }
3407
3408
3409
3410 //
3411 // Do a find on the de-tagged input using the caller's pattern
3412 // TODO: error on count>1 and not find().
3413 // error on both matches() and lookingAt().
3414 //
3415 for (i=0; i<numFinds; i++) {
3416 if (useMatchesFunc) {
3417 isMatch = matcher->matches(status);
3418 if (UTF8Matcher != NULL) {
3419 isUTF8Match = UTF8Matcher->matches(status);
3420 }
3421 } else if (useLookingAtFunc) {
3422 isMatch = matcher->lookingAt(status);
3423 if (UTF8Matcher != NULL) {
3424 isUTF8Match = UTF8Matcher->lookingAt(status);
3425 }
b75a7d8f 3426 } else {
729e4ab9
A
3427 isMatch = matcher->find();
3428 if (UTF8Matcher != NULL) {
3429 isUTF8Match = UTF8Matcher->find();
3430 }
b75a7d8f 3431 }
729e4ab9
A
3432 }
3433 matcher->setTrace(FALSE);
b75a7d8f 3434
729e4ab9
A
3435 //
3436 // Match up the groups from the find() with the groups from the tags
3437 //
3438
3439 // number of tags should match number of groups from find operation.
3440 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3441 // G option in test means that capture group data is not available in the
3442 // expected results, so the check needs to be suppressed.
3443 if (isMatch == FALSE && groupStarts.size() != 0) {
3444 errln("Error at line %d: Match expected, but none found.", line);
3445 failed = TRUE;
3446 goto cleanupAndReturn;
3447 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3448 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3449 failed = TRUE;
3450 goto cleanupAndReturn;
3451 }
3452
3453 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3454 // Only check for match / no match. Don't check capture groups.
3455 if (isMatch && groupStarts.size() == 0) {
3456 errln("Error at line %d: No match expected, but one found.", line);
3457 failed = TRUE;
3458 } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3459 errln("Error at line %d: No match expected, but one found. (UTF8)", line);
3460 failed = TRUE;
3461 }
3462 goto cleanupAndReturn;
3463 }
3464
3465 REGEX_CHECK_STATUS_L(line);
3466 for (i=0; i<=matcher->groupCount(); i++) {
3467 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3468 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3469 if (matcher->start(i, status) != expectedStart) {
3470 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3471 line, i, expectedStart, matcher->start(i, status));
3472 failed = TRUE;
3473 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3474 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3475 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3476 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3477 failed = TRUE;
3478 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3479 }
3480
3481 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3482 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3483 if (matcher->end(i, status) != expectedEnd) {
3484 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3485 line, i, expectedEnd, matcher->end(i, status));
3486 failed = TRUE;
3487 // Error on end position; keep going; real error is probably yet to come as group
3488 // end positions work from end of the input data towards the front.
3489 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3490 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3491 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3492 failed = TRUE;
3493 // Error on end position; keep going; real error is probably yet to come as group
3494 // end positions work from end of the input data towards the front.
3495 }
3496 }
3497 if ( matcher->groupCount()+1 < groupStarts.size()) {
3498 errln("Error at line %d: Expected %d capture groups, found %d.",
3499 line, groupStarts.size()-1, matcher->groupCount());
3500 failed = TRUE;
b75a7d8f 3501 }
729e4ab9
A
3502 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3503 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3504 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3505 failed = TRUE;
3506 }
3507
3508 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3509 matcher->requireEnd() == TRUE) {
3510 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3511 failed = TRUE;
3512 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3513 UTF8Matcher->requireEnd() == TRUE) {
3514 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3515 failed = TRUE;
3516 }
3517
3518 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3519 matcher->requireEnd() == FALSE) {
3520 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3521 failed = TRUE;
3522 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3523 UTF8Matcher->requireEnd() == FALSE) {
3524 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3525 failed = TRUE;
3526 }
3527
3528 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3529 matcher->hitEnd() == TRUE) {
3530 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3531 failed = TRUE;
3532 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3533 UTF8Matcher->hitEnd() == TRUE) {
3534 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3535 failed = TRUE;
3536 }
3537
3538 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3539 matcher->hitEnd() == FALSE) {
3540 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3541 failed = TRUE;
3542 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3543 UTF8Matcher->hitEnd() == FALSE) {
3544 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3545 failed = TRUE;
3546 }
3547
3548
3549cleanupAndReturn:
3550 if (failed) {
3551 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3552 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3553 // callerPattern->dump();
3554 }
3555 delete parseMatcher;
3556 delete parsePat;
3557 delete UTF8Matcher;
3558 delete UTF8Pattern;
3559 delete matcher;
3560 delete callerPattern;
3561
3562 utext_close(&inputText);
3563 delete[] inputChars;
3564 utext_close(&patternText);
3565 delete[] patternChars;
3566 ucnv_close(UTF8Converter);
3567}
3568
3569
3570
3571
3572//---------------------------------------------------------------------------
3573//
3574// Errors Check for error handling in patterns.
3575//
3576//---------------------------------------------------------------------------
3577void RegexTest::Errors() {
3578 // \escape sequences that aren't implemented yet.
3579 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3580
3581 // Missing close parentheses
3582 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3583 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3584 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3585
3586 // Extra close paren
3587 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3588 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3589 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3590
3591 // Look-ahead, Look-behind
3592 // TODO: add tests for unbounded length look-behinds.
3593 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3594
3595 // Attempt to use non-default flags
3596 {
3597 UParseError pe;
3598 UErrorCode status = U_ZERO_ERROR;
3599 int32_t flags = UREGEX_CANON_EQ |
3600 UREGEX_COMMENTS | UREGEX_DOTALL |
3601 UREGEX_MULTILINE;
3602 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3603 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3604 delete pat1;
3605 }
3606
3607
3608 // Quantifiers are allowed only after something that can be quantified.
3609 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3610 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3611 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3612
3613 // Mal-formed {min,max} quantifiers
3614 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3615 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3616 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3617 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3618 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3619 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3620 REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3621 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3622 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
b75a7d8f 3623
729e4ab9
A
3624 // Ticket 5389
3625 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
b75a7d8f 3626
729e4ab9
A
3627 // Invalid Back Reference \0
3628 // For ICU 3.8 and earlier
3629 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3630 //
3631 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
b75a7d8f
A
3632
3633}
3634
3635
729e4ab9
A
3636//-------------------------------------------------------------------------------
3637//
3638// Read a text data file, convert it to UChars, and return the data
3639// in one big UChar * buffer, which the caller must delete.
46f4442e 3640//
729e4ab9
A
3641//--------------------------------------------------------------------------------
3642UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3643 const char *defEncoding, UErrorCode &status) {
3644 UChar *retPtr = NULL;
3645 char *fileBuf = NULL;
3646 UConverter* conv = NULL;
3647 FILE *f = NULL;
46f4442e 3648
729e4ab9
A
3649 ulen = 0;
3650 if (U_FAILURE(status)) {
3651 return retPtr;
46f4442e 3652 }
46f4442e
A
3653
3654 //
729e4ab9 3655 // Open the file.
46f4442e 3656 //
729e4ab9
A
3657 f = fopen(fileName, "rb");
3658 if (f == 0) {
3659 dataerrln("Error opening test data file %s\n", fileName);
3660 status = U_FILE_ACCESS_ERROR;
3661 return NULL;
46f4442e 3662 }
729e4ab9
A
3663 //
3664 // Read it in
3665 //
3666 int32_t fileSize;
3667 int32_t amt_read;
3668
3669 fseek( f, 0, SEEK_END);
3670 fileSize = ftell(f);
3671 fileBuf = new char[fileSize];
3672 fseek(f, 0, SEEK_SET);
3673 amt_read = fread(fileBuf, 1, fileSize, f);
3674 if (amt_read != fileSize || fileSize <= 0) {
3675 errln("Error reading test data file.");
3676 goto cleanUpAndReturn;
46f4442e
A
3677 }
3678
729e4ab9
A
3679 //
3680 // Look for a Unicode Signature (BOM) on the data just read
3681 //
3682 int32_t signatureLength;
3683 const char * fileBufC;
3684 const char* encoding;
46f4442e 3685
729e4ab9
A
3686 fileBufC = fileBuf;
3687 encoding = ucnv_detectUnicodeSignature(
3688 fileBuf, fileSize, &signatureLength, &status);
3689 if(encoding!=NULL ){
3690 fileBufC += signatureLength;
3691 fileSize -= signatureLength;
3692 } else {
3693 encoding = defEncoding;
3694 if (strcmp(encoding, "utf-8") == 0) {
3695 errln("file %s is missing its BOM", fileName);
46f4442e
A
3696 }
3697 }
3698
729e4ab9
A
3699 //
3700 // Open a converter to take the rule file to UTF-16
3701 //
3702 conv = ucnv_open(encoding, &status);
3703 if (U_FAILURE(status)) {
3704 goto cleanUpAndReturn;
46f4442e
A
3705 }
3706
729e4ab9
A
3707 //
3708 // Convert the rules to UChar.
3709 // Preflight first to determine required buffer size.
3710 //
3711 ulen = ucnv_toUChars(conv,
3712 NULL, // dest,
3713 0, // destCapacity,
3714 fileBufC,
3715 fileSize,
3716 &status);
3717 if (status == U_BUFFER_OVERFLOW_ERROR) {
3718 // Buffer Overflow is expected from the preflight operation.
3719 status = U_ZERO_ERROR;
3720
3721 retPtr = new UChar[ulen+1];
3722 ucnv_toUChars(conv,
3723 retPtr, // dest,
3724 ulen+1,
3725 fileBufC,
3726 fileSize,
3727 &status);
46f4442e
A
3728 }
3729
729e4ab9
A
3730cleanUpAndReturn:
3731 fclose(f);
3732 delete[] fileBuf;
3733 ucnv_close(conv);
3734 if (U_FAILURE(status)) {
3735 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3736 delete retPtr;
3737 retPtr = 0;
3738 ulen = 0;
3739 };
3740 return retPtr;
3741}
3742
3743
3744//-------------------------------------------------------------------------------
3745//
3746// PerlTests - Run Perl's regular expression tests
3747// The input file for this test is re_tests, the standard regular
3748// expression test data distributed with the Perl source code.
3749//
3750// Here is Perl's description of the test data file:
3751//
3752// # The tests are in a separate file 't/op/re_tests'.
3753// # Each line in that file is a separate test.
3754// # There are five columns, separated by tabs.
3755// #
3756// # Column 1 contains the pattern, optionally enclosed in C<''>.
3757// # Modifiers can be put after the closing C<'>.
3758// #
3759// # Column 2 contains the string to be matched.
3760// #
3761// # Column 3 contains the expected result:
3762// # y expect a match
3763// # n expect no match
3764// # c expect an error
3765// # B test exposes a known bug in Perl, should be skipped
3766// # b test exposes a known bug in Perl, should be skipped if noamp
3767// #
3768// # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3769// #
3770// # Column 4 contains a string, usually C<$&>.
3771// #
3772// # Column 5 contains the expected result of double-quote
3773// # interpolating that string after the match, or start of error message.
3774// #
3775// # Column 6, if present, contains a reason why the test is skipped.
3776// # This is printed with "skipped", for harness to pick up.
3777// #
3778// # \n in the tests are interpolated, as are variables of the form ${\w+}.
3779// #
3780// # If you want to add a regular expression test that can't be expressed
3781// # in this format, don't add it here: put it in op/pat.t instead.
3782//
3783// For ICU, if field 3 contains an 'i', the test will be skipped.
3784// The test exposes some known incompatibility between ICU and Perl regexps.
3785// (The i is in addition to whatever was there before.)
3786//
3787//-------------------------------------------------------------------------------
3788void RegexTest::PerlTests() {
3789 char tdd[2048];
3790 const char *srcPath;
3791 UErrorCode status = U_ZERO_ERROR;
3792 UParseError pe;
46f4442e
A
3793
3794 //
729e4ab9 3795 // Open and read the test data file.
46f4442e 3796 //
729e4ab9
A
3797 srcPath=getPath(tdd, "re_tests.txt");
3798 if(srcPath==NULL) {
3799 return; /* something went wrong, error already output */
46f4442e 3800 }
729e4ab9
A
3801
3802 int32_t len;
3803 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3804 if (U_FAILURE(status)) {
3805 return; /* something went wrong, error already output */
46f4442e
A
3806 }
3807
3808 //
729e4ab9 3809 // Put the test data into a UnicodeString
46f4442e 3810 //
729e4ab9 3811 UnicodeString testDataString(FALSE, testData, len);
46f4442e 3812
729e4ab9
A
3813 //
3814 // Regex to break the input file into lines, and strip the new lines.
3815 // One line per match, capture group one is the desired data.
3816 //
3817 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3818 if (U_FAILURE(status)) {
3819 dataerrln("RegexPattern::compile() error");
3820 return;
46f4442e 3821 }
729e4ab9 3822 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
46f4442e 3823
729e4ab9
A
3824 //
3825 // Regex to split a test file line into fields.
3826 // There are six fields, separated by tabs.
3827 //
3828 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
46f4442e
A
3829
3830 //
729e4ab9
A
3831 // Regex to identify test patterns with flag settings, and to separate them.
3832 // Test patterns with flags look like 'pattern'i
3833 // Test patterns without flags are not quoted: pattern
3834 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
46f4442e 3835 //
729e4ab9
A
3836 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3837 RegexMatcher* flagMat = flagPat->matcher(status);
46f4442e
A
3838
3839 //
729e4ab9
A
3840 // The Perl tests reference several perl-isms, which are evaluated/substituted
3841 // in the test data. Not being perl, this must be done explicitly. Here
3842 // are string constants and REs for these constructs.
46f4442e 3843 //
729e4ab9
A
3844 UnicodeString nulnulSrc("${nulnul}");
3845 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3846 nulnul = nulnul.unescape();
3847
3848 UnicodeString ffffSrc("${ffff}");
3849 UnicodeString ffff("\\uffff", -1, US_INV);
3850 ffff = ffff.unescape();
3851
3852 // regexp for $-[0], $+[2], etc.
3853 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3854 RegexMatcher *groupsMat = groupsPat->matcher(status);
3855
3856 // regexp for $0, $1, $2, etc.
3857 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3858 RegexMatcher *cgMat = cgPat->matcher(status);
3859
46f4442e
A
3860
3861 //
729e4ab9
A
3862 // Main Loop for the Perl Tests, runs once per line from the
3863 // test data file.
46f4442e 3864 //
729e4ab9
A
3865 int32_t lineNum = 0;
3866 int32_t skippedUnimplementedCount = 0;
3867 while (lineMat->find()) {
3868 lineNum++;
46f4442e 3869
729e4ab9
A
3870 //
3871 // Get a line, break it into its fields, do the Perl
3872 // variable substitutions.
3873 //
3874 UnicodeString line = lineMat->group(1, status);
3875 UnicodeString fields[7];
3876 fieldPat->split(line, fields, 7, status);
46f4442e 3877
729e4ab9
A
3878 flagMat->reset(fields[0]);
3879 flagMat->matches(status);
3880 UnicodeString pattern = flagMat->group(2, status);
3881 pattern.findAndReplace("${bang}", "!");
3882 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3883 pattern.findAndReplace(ffffSrc, ffff);
3884
3885 //
3886 // Identify patterns that include match flag settings,
3887 // split off the flags, remove the extra quotes.
3888 //
3889 UnicodeString flagStr = flagMat->group(3, status);
3890 if (U_FAILURE(status)) {
3891 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3892 return;
3893 }
3894 int32_t flags = 0;
3895 const UChar UChar_c = 0x63; // Char constants for the flag letters.
3896 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
3897 const UChar UChar_m = 0x6d;
3898 const UChar UChar_x = 0x78;
3899 const UChar UChar_y = 0x79;
3900 if (flagStr.indexOf(UChar_i) != -1) {
3901 flags |= UREGEX_CASE_INSENSITIVE;
3902 }
3903 if (flagStr.indexOf(UChar_m) != -1) {
3904 flags |= UREGEX_MULTILINE;
3905 }
3906 if (flagStr.indexOf(UChar_x) != -1) {
3907 flags |= UREGEX_COMMENTS;
46f4442e 3908 }
46f4442e 3909
729e4ab9
A
3910 //
3911 // Compile the test pattern.
3912 //
3913 status = U_ZERO_ERROR;
3914 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
3915 if (status == U_REGEX_UNIMPLEMENTED) {
3916 //
3917 // Test of a feature that is planned for ICU, but not yet implemented.
3918 // skip the test.
3919 skippedUnimplementedCount++;
3920 delete testPat;
3921 status = U_ZERO_ERROR;
3922 continue;
46f4442e 3923 }
729e4ab9
A
3924
3925 if (U_FAILURE(status)) {
3926 // Some tests are supposed to generate errors.
3927 // Only report an error for tests that are supposed to succeed.
3928 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
3929 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
3930 {
3931 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
3932 }
3933 status = U_ZERO_ERROR;
3934 delete testPat;
3935 continue;
46f4442e 3936 }
729e4ab9
A
3937
3938 if (fields[2].indexOf(UChar_i) >= 0) {
3939 // ICU should skip this test.
3940 delete testPat;
3941 continue;
46f4442e
A
3942 }
3943
729e4ab9
A
3944 if (fields[2].indexOf(UChar_c) >= 0) {
3945 // This pattern should have caused a compilation error, but didn't/
3946 errln("line %d: Expected a pattern compile error, got success.", lineNum);
3947 delete testPat;
3948 continue;
3949 }
3950
3951 //
3952 // replace the Perl variables that appear in some of the
3953 // match data strings.
3954 //
3955 UnicodeString matchString = fields[1];
3956 matchString.findAndReplace(nulnulSrc, nulnul);
3957 matchString.findAndReplace(ffffSrc, ffff);
46f4442e 3958
729e4ab9
A
3959 // Replace any \n in the match string with an actual new-line char.
3960 // Don't do full unescape, as this unescapes more than Perl does, which
3961 // causes other spurious failures in the tests.
3962 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
46f4442e 3963
46f4442e
A
3964
3965
729e4ab9
A
3966 //
3967 // Run the test, check for expected match/don't match result.
3968 //
3969 RegexMatcher *testMat = testPat->matcher(matchString, status);
3970 UBool found = testMat->find();
3971 UBool expected = FALSE;
3972 if (fields[2].indexOf(UChar_y) >=0) {
3973 expected = TRUE;
3974 }
3975 if (expected != found) {
3976 errln("line %d: Expected %smatch, got %smatch",
3977 lineNum, expected?"":"no ", found?"":"no " );
3978 continue;
3979 }
3980
3981 // Don't try to check expected results if there is no match.
3982 // (Some have stuff in the expected fields)
3983 if (!found) {
3984 delete testMat;
3985 delete testPat;
3986 continue;
3987 }
46f4442e 3988
729e4ab9
A
3989 //
3990 // Interpret the Perl expression from the fourth field of the data file,
3991 // building up an ICU string from the results of the ICU match.
3992 // The Perl expression will contain references to the results of
3993 // a regex match, including the matched string, capture group strings,
3994 // group starting and ending indicies, etc.
3995 //
3996 UnicodeString resultString;
3997 UnicodeString perlExpr = fields[3];
3998#if SUPPORT_MUTATING_INPUT_STRING
3999 groupsMat->reset(perlExpr);
4000 cgMat->reset(perlExpr);
4001#endif
46f4442e 4002
729e4ab9
A
4003 while (perlExpr.length() > 0) {
4004#if !SUPPORT_MUTATING_INPUT_STRING
4005 // Perferred usage. Reset after any modification to input string.
4006 groupsMat->reset(perlExpr);
4007 cgMat->reset(perlExpr);
4008#endif
b75a7d8f 4009
729e4ab9
A
4010 if (perlExpr.startsWith("$&")) {
4011 resultString.append(testMat->group(status));
4012 perlExpr.remove(0, 2);
4013 }
b75a7d8f 4014
729e4ab9
A
4015 else if (groupsMat->lookingAt(status)) {
4016 // $-[0] $+[2] etc.
4017 UnicodeString digitString = groupsMat->group(2, status);
4018 int32_t t = 0;
4019 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4020 UnicodeString plusOrMinus = groupsMat->group(1, status);
4021 int32_t matchPosition;
4022 if (plusOrMinus.compare("+") == 0) {
4023 matchPosition = testMat->end(groupNum, status);
4024 } else {
4025 matchPosition = testMat->start(groupNum, status);
4026 }
4027 if (matchPosition != -1) {
4028 ICU_Utility::appendNumber(resultString, matchPosition);
4029 }
4030 perlExpr.remove(0, groupsMat->end(status));
4031 }
b75a7d8f 4032
729e4ab9
A
4033 else if (cgMat->lookingAt(status)) {
4034 // $1, $2, $3, etc.
4035 UnicodeString digitString = cgMat->group(1, status);
4036 int32_t t = 0;
4037 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4038 if (U_SUCCESS(status)) {
4039 resultString.append(testMat->group(groupNum, status));
4040 status = U_ZERO_ERROR;
4041 }
4042 perlExpr.remove(0, cgMat->end(status));
4043 }
b75a7d8f 4044
729e4ab9
A
4045 else if (perlExpr.startsWith("@-")) {
4046 int32_t i;
4047 for (i=0; i<=testMat->groupCount(); i++) {
4048 if (i>0) {
4049 resultString.append(" ");
4050 }
4051 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4052 }
4053 perlExpr.remove(0, 2);
4054 }
b75a7d8f 4055
729e4ab9
A
4056 else if (perlExpr.startsWith("@+")) {
4057 int32_t i;
4058 for (i=0; i<=testMat->groupCount(); i++) {
4059 if (i>0) {
4060 resultString.append(" ");
4061 }
4062 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4063 }
4064 perlExpr.remove(0, 2);
4065 }
b75a7d8f 4066
729e4ab9
A
4067 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4068 // or as an escaped sequence (e.g. \n)
4069 if (perlExpr.length() > 1) {
4070 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4071 }
4072 UChar c = perlExpr.charAt(0);
4073 switch (c) {
4074 case 'n': c = '\n'; break;
4075 // add any other escape sequences that show up in the test expected results.
4076 }
4077 resultString.append(c);
4078 perlExpr.remove(0, 1);
4079 }
b75a7d8f 4080
729e4ab9
A
4081 else {
4082 // Any characters from the perl expression that we don't explicitly
4083 // recognize before here are assumed to be literals and copied
4084 // as-is to the expected results.
4085 resultString.append(perlExpr.charAt(0));
4086 perlExpr.remove(0, 1);
4087 }
374ca955 4088
729e4ab9
A
4089 if (U_FAILURE(status)) {
4090 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4091 break;
4092 }
4093 }
b75a7d8f 4094
729e4ab9
A
4095 //
4096 // Expected Results Compare
4097 //
4098 UnicodeString expectedS(fields[4]);
4099 expectedS.findAndReplace(nulnulSrc, nulnul);
4100 expectedS.findAndReplace(ffffSrc, ffff);
4101 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
4102
4103
729e4ab9
A
4104 if (expectedS.compare(resultString) != 0) {
4105 err("Line %d: Incorrect perl expression results.", lineNum);
4106 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4107 }
374ca955 4108
729e4ab9
A
4109 delete testMat;
4110 delete testPat;
b75a7d8f 4111 }
374ca955 4112
b75a7d8f 4113 //
729e4ab9 4114 // All done. Clean up allocated stuff.
b75a7d8f 4115 //
729e4ab9
A
4116 delete cgMat;
4117 delete cgPat;
374ca955 4118
729e4ab9
A
4119 delete groupsMat;
4120 delete groupsPat;
374ca955 4121
729e4ab9
A
4122 delete flagMat;
4123 delete flagPat;
374ca955 4124
729e4ab9
A
4125 delete lineMat;
4126 delete linePat;
374ca955 4127
729e4ab9
A
4128 delete fieldPat;
4129 delete [] testData;
374ca955 4130
374ca955 4131
729e4ab9 4132 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
b75a7d8f 4133
b75a7d8f
A
4134}
4135
4136
4137//-------------------------------------------------------------------------------
4138//
729e4ab9
A
4139// PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4140// (instead of using UnicodeStrings) to test the alternate engine.
4141// The input file for this test is re_tests, the standard regular
4142// expression test data distributed with the Perl source code.
4143// See PerlTests() for more information.
b75a7d8f
A
4144//
4145//-------------------------------------------------------------------------------
729e4ab9 4146void RegexTest::PerlTestsUTF8() {
374ca955
A
4147 char tdd[2048];
4148 const char *srcPath;
b75a7d8f
A
4149 UErrorCode status = U_ZERO_ERROR;
4150 UParseError pe;
729e4ab9
A
4151 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4152 UText patternText = UTEXT_INITIALIZER;
4153 char *patternChars = NULL;
4154 int32_t patternLength;
4155 int32_t patternCapacity = 0;
4156 UText inputText = UTEXT_INITIALIZER;
4157 char *inputChars = NULL;
4158 int32_t inputLength;
4159 int32_t inputCapacity = 0;
4160
4161 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
b75a7d8f
A
4162
4163 //
4164 // Open and read the test data file.
4165 //
374ca955
A
4166 srcPath=getPath(tdd, "re_tests.txt");
4167 if(srcPath==NULL) {
4168 return; /* something went wrong, error already output */
b75a7d8f
A
4169 }
4170
46f4442e
A
4171 int32_t len;
4172 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
374ca955
A
4173 if (U_FAILURE(status)) {
4174 return; /* something went wrong, error already output */
4175 }
b75a7d8f
A
4176
4177 //
4178 // Put the test data into a UnicodeString
4179 //
4180 UnicodeString testDataString(FALSE, testData, len);
4181
4182 //
4183 // Regex to break the input file into lines, and strip the new lines.
4184 // One line per match, capture group one is the desired data.
4185 //
46f4442e 4186 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
73c04bcf
A
4187 if (U_FAILURE(status)) {
4188 dataerrln("RegexPattern::compile() error");
4189 return;
4190 }
b75a7d8f
A
4191 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4192
4193 //
4194 // Regex to split a test file line into fields.
4195 // There are six fields, separated by tabs.
4196 //
46f4442e 4197 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
b75a7d8f
A
4198
4199 //
4200 // Regex to identify test patterns with flag settings, and to separate them.
4201 // Test patterns with flags look like 'pattern'i
4202 // Test patterns without flags are not quoted: pattern
4203 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4204 //
46f4442e 4205 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
374ca955 4206 RegexMatcher* flagMat = flagPat->matcher(status);
b75a7d8f
A
4207
4208 //
4209 // The Perl tests reference several perl-isms, which are evaluated/substituted
4210 // in the test data. Not being perl, this must be done explicitly. Here
4211 // are string constants and REs for these constructs.
4212 //
4213 UnicodeString nulnulSrc("${nulnul}");
46f4442e 4214 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
b75a7d8f
A
4215 nulnul = nulnul.unescape();
4216
4217 UnicodeString ffffSrc("${ffff}");
46f4442e 4218 UnicodeString ffff("\\uffff", -1, US_INV);
b75a7d8f
A
4219 ffff = ffff.unescape();
4220
4221 // regexp for $-[0], $+[2], etc.
46f4442e 4222 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
374ca955
A
4223 RegexMatcher *groupsMat = groupsPat->matcher(status);
4224
b75a7d8f 4225 // regexp for $0, $1, $2, etc.
46f4442e 4226 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
374ca955 4227 RegexMatcher *cgMat = cgPat->matcher(status);
b75a7d8f
A
4228
4229
4230 //
4231 // Main Loop for the Perl Tests, runs once per line from the
4232 // test data file.
4233 //
4234 int32_t lineNum = 0;
4235 int32_t skippedUnimplementedCount = 0;
4236 while (lineMat->find()) {
4237 lineNum++;
4238
4239 //
4240 // Get a line, break it into its fields, do the Perl
4241 // variable substitutions.
4242 //
4243 UnicodeString line = lineMat->group(1, status);
4244 UnicodeString fields[7];
4245 fieldPat->split(line, fields, 7, status);
4246
4247 flagMat->reset(fields[0]);
4248 flagMat->matches(status);
4249 UnicodeString pattern = flagMat->group(2, status);
4250 pattern.findAndReplace("${bang}", "!");
46f4442e 4251 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
b75a7d8f
A
4252 pattern.findAndReplace(ffffSrc, ffff);
4253
4254 //
4255 // Identify patterns that include match flag settings,
4256 // split off the flags, remove the extra quotes.
4257 //
4258 UnicodeString flagStr = flagMat->group(3, status);
4259 if (U_FAILURE(status)) {
4260 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4261 return;
4262 }
4263 int32_t flags = 0;
4264 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4265 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4266 const UChar UChar_m = 0x6d;
4267 const UChar UChar_x = 0x78;
4268 const UChar UChar_y = 0x79;
4269 if (flagStr.indexOf(UChar_i) != -1) {
4270 flags |= UREGEX_CASE_INSENSITIVE;
4271 }
4272 if (flagStr.indexOf(UChar_m) != -1) {
4273 flags |= UREGEX_MULTILINE;
4274 }
4275 if (flagStr.indexOf(UChar_x) != -1) {
4276 flags |= UREGEX_COMMENTS;
4277 }
729e4ab9
A
4278
4279 //
4280 // Put the pattern in a UTF-8 UText
4281 //
4282 status = U_ZERO_ERROR;
4283 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4284 if (status == U_BUFFER_OVERFLOW_ERROR) {
4285 status = U_ZERO_ERROR;
4286 delete[] patternChars;
4287 patternCapacity = patternLength + 1;
4288 patternChars = new char[patternCapacity];
4289 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4290 }
4291 utext_openUTF8(&patternText, patternChars, patternLength, &status);
b75a7d8f
A
4292
4293 //
4294 // Compile the test pattern.
4295 //
729e4ab9 4296 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
b75a7d8f
A
4297 if (status == U_REGEX_UNIMPLEMENTED) {
4298 //
4299 // Test of a feature that is planned for ICU, but not yet implemented.
4300 // skip the test.
4301 skippedUnimplementedCount++;
4302 delete testPat;
4303 status = U_ZERO_ERROR;
4304 continue;
4305 }
4306
4307 if (U_FAILURE(status)) {
4308 // Some tests are supposed to generate errors.
4309 // Only report an error for tests that are supposed to succeed.
4310 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4311 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4312 {
4313 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4314 }
4315 status = U_ZERO_ERROR;
4316 delete testPat;
4317 continue;
4318 }
4319
4320 if (fields[2].indexOf(UChar_i) >= 0) {
4321 // ICU should skip this test.
4322 delete testPat;
4323 continue;
4324 }
4325
4326 if (fields[2].indexOf(UChar_c) >= 0) {
4327 // This pattern should have caused a compilation error, but didn't/
4328 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4329 delete testPat;
4330 continue;
4331 }
4332
729e4ab9 4333
b75a7d8f
A
4334 //
4335 // replace the Perl variables that appear in some of the
374ca955 4336 // match data strings.
b75a7d8f
A
4337 //
4338 UnicodeString matchString = fields[1];
4339 matchString.findAndReplace(nulnulSrc, nulnul);
4340 matchString.findAndReplace(ffffSrc, ffff);
4341
4342 // Replace any \n in the match string with an actual new-line char.
4343 // Don't do full unescape, as this unescapes more than Perl does, which
4344 // causes other spurious failures in the tests.
46f4442e 4345 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
374ca955 4346
729e4ab9
A
4347 //
4348 // Put the input in a UTF-8 UText
4349 //
4350 status = U_ZERO_ERROR;
4351 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4352 if (status == U_BUFFER_OVERFLOW_ERROR) {
4353 status = U_ZERO_ERROR;
4354 delete[] inputChars;
4355 inputCapacity = inputLength + 1;
4356 inputChars = new char[inputCapacity];
4357 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4358 }
4359 utext_openUTF8(&inputText, inputChars, inputLength, &status);
b75a7d8f
A
4360
4361 //
4362 // Run the test, check for expected match/don't match result.
4363 //
729e4ab9 4364 RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
b75a7d8f
A
4365 UBool found = testMat->find();
4366 UBool expected = FALSE;
4367 if (fields[2].indexOf(UChar_y) >=0) {
4368 expected = TRUE;
4369 }
4370 if (expected != found) {
374ca955 4371 errln("line %d: Expected %smatch, got %smatch",
b75a7d8f
A
4372 lineNum, expected?"":"no ", found?"":"no " );
4373 continue;
4374 }
46f4442e
A
4375
4376 // Don't try to check expected results if there is no match.
4377 // (Some have stuff in the expected fields)
4378 if (!found) {
4379 delete testMat;
4380 delete testPat;
4381 continue;
4382 }
b75a7d8f
A
4383
4384 //
4385 // Interpret the Perl expression from the fourth field of the data file,
4386 // building up an ICU string from the results of the ICU match.
374ca955 4387 // The Perl expression will contain references to the results of
b75a7d8f
A
4388 // a regex match, including the matched string, capture group strings,
4389 // group starting and ending indicies, etc.
4390 //
4391 UnicodeString resultString;
4392 UnicodeString perlExpr = fields[3];
b75a7d8f
A
4393
4394 while (perlExpr.length() > 0) {
729e4ab9
A
4395 groupsMat->reset(perlExpr);
4396 cgMat->reset(perlExpr);
4397
b75a7d8f
A
4398 if (perlExpr.startsWith("$&")) {
4399 resultString.append(testMat->group(status));
4400 perlExpr.remove(0, 2);
4401 }
4402
4403 else if (groupsMat->lookingAt(status)) {
4404 // $-[0] $+[2] etc.
4405 UnicodeString digitString = groupsMat->group(2, status);
4406 int32_t t = 0;
4407 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4408 UnicodeString plusOrMinus = groupsMat->group(1, status);
4409 int32_t matchPosition;
4410 if (plusOrMinus.compare("+") == 0) {
4411 matchPosition = testMat->end(groupNum, status);
4412 } else {
4413 matchPosition = testMat->start(groupNum, status);
4414 }
4415 if (matchPosition != -1) {
4416 ICU_Utility::appendNumber(resultString, matchPosition);
4417 }
4418 perlExpr.remove(0, groupsMat->end(status));
4419 }
4420
4421 else if (cgMat->lookingAt(status)) {
4422 // $1, $2, $3, etc.
4423 UnicodeString digitString = cgMat->group(1, status);
4424 int32_t t = 0;
4425 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4426 if (U_SUCCESS(status)) {
4427 resultString.append(testMat->group(groupNum, status));
4428 status = U_ZERO_ERROR;
4429 }
4430 perlExpr.remove(0, cgMat->end(status));
4431 }
4432
4433 else if (perlExpr.startsWith("@-")) {
46f4442e 4434 int32_t i;
b75a7d8f
A
4435 for (i=0; i<=testMat->groupCount(); i++) {
4436 if (i>0) {
4437 resultString.append(" ");
4438 }
4439 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4440 }
4441 perlExpr.remove(0, 2);
4442 }
4443
4444 else if (perlExpr.startsWith("@+")) {
46f4442e 4445 int32_t i;
b75a7d8f
A
4446 for (i=0; i<=testMat->groupCount(); i++) {
4447 if (i>0) {
4448 resultString.append(" ");
4449 }
4450 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4451 }
4452 perlExpr.remove(0, 2);
4453 }
4454
46f4442e 4455 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
b75a7d8f
A
4456 // or as an escaped sequence (e.g. \n)
4457 if (perlExpr.length() > 1) {
4458 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4459 }
4460 UChar c = perlExpr.charAt(0);
4461 switch (c) {
4462 case 'n': c = '\n'; break;
4463 // add any other escape sequences that show up in the test expected results.
4464 }
374ca955 4465 resultString.append(c);
b75a7d8f
A
4466 perlExpr.remove(0, 1);
4467 }
4468
4469 else {
4470 // Any characters from the perl expression that we don't explicitly
4471 // recognize before here are assumed to be literals and copied
4472 // as-is to the expected results.
4473 resultString.append(perlExpr.charAt(0));
4474 perlExpr.remove(0, 1);
4475 }
4476
4477 if (U_FAILURE(status)) {
4478 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4479 break;
4480 }
4481 }
374ca955 4482
b75a7d8f
A
4483 //
4484 // Expected Results Compare
4485 //
4486 UnicodeString expectedS(fields[4]);
4487 expectedS.findAndReplace(nulnulSrc, nulnul);
4488 expectedS.findAndReplace(ffffSrc, ffff);
46f4442e 4489 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
b75a7d8f
A
4490
4491
4492 if (expectedS.compare(resultString) != 0) {
73c04bcf 4493 err("Line %d: Incorrect perl expression results.", lineNum);
729e4ab9 4494 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
b75a7d8f
A
4495 }
4496
4497 delete testMat;
4498 delete testPat;
4499 }
4500
4501 //
4502 // All done. Clean up allocated stuff.
4503 //
4504 delete cgMat;
4505 delete cgPat;
374ca955 4506
b75a7d8f
A
4507 delete groupsMat;
4508 delete groupsPat;
374ca955 4509
b75a7d8f
A
4510 delete flagMat;
4511 delete flagPat;
4512
4513 delete lineMat;
4514 delete linePat;
374ca955 4515
b75a7d8f
A
4516 delete fieldPat;
4517 delete [] testData;
729e4ab9
A
4518
4519 utext_close(&patternText);
4520 utext_close(&inputText);
4521
4522 delete [] patternChars;
4523 delete [] inputChars;
374ca955 4524
b75a7d8f
A
4525
4526 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4527
4528}
4529
4530
729e4ab9
A
4531//--------------------------------------------------------------
4532//
4533// Bug6149 Verify limits to heap expansion for backtrack stack.
4534// Use this pattern,
4535// "(a?){1,}"
4536// The zero-length match will repeat forever.
4537// (That this goes into a loop is another bug)
4538//
4539//---------------------------------------------------------------
4540void RegexTest::Bug6149() {
4541 UnicodeString pattern("(a?){1,}");
4542 UnicodeString s("xyz");
4543 uint32_t flags = 0;
4544 UErrorCode status = U_ZERO_ERROR;
4545
4546 RegexMatcher matcher(pattern, s, flags, status);
4547 UBool result = false;
4548 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4549 REGEX_ASSERT(result == FALSE);
4550 }
4551
4552
46f4442e
A
4553//
4554// Callbacks() Test the callback function.
4555// When set, callbacks occur periodically during matching operations,
4556// giving the application code the ability to abort the operation
4557// before it's normal completion.
4558//
4559
4560struct callBackContext {
4561 RegexTest *test;
4562 int32_t maxCalls;
4563 int32_t numCalls;
4564 int32_t lastSteps;
4565 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4566};
4567
4568U_CDECL_BEGIN
4569static UBool U_CALLCONV
4570testCallBackFn(const void *context, int32_t steps) {
4571 callBackContext *info = (callBackContext *)context;
4572 if (info->lastSteps+1 != steps) {
4573 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4574 }
4575 info->lastSteps = steps;
4576 info->numCalls++;
4577 return (info->numCalls < info->maxCalls);
4578}
4579U_CDECL_END
4580
4581void RegexTest::Callbacks() {
4582 {
4583 // Getter returns NULLs if no callback has been set
4584
4585 // The variables that the getter will fill in.
4586 // Init to non-null values so that the action of the getter can be seen.
4587 const void *returnedContext = &returnedContext;
4588 URegexMatchCallback *returnedFn = &testCallBackFn;
4589
4590 UErrorCode status = U_ZERO_ERROR;
4591 RegexMatcher matcher("x", 0, status);
4592 REGEX_CHECK_STATUS;
4593 matcher.getMatchCallback(returnedFn, returnedContext, status);
4594 REGEX_CHECK_STATUS;
4595 REGEX_ASSERT(returnedFn == NULL);
4596 REGEX_ASSERT(returnedContext == NULL);
4597 }
4598
4599 {
4600 // Set and Get work
4601 callBackContext cbInfo = {this, 0, 0, 0};
4602 const void *returnedContext;
4603 URegexMatchCallback *returnedFn;
4604 UErrorCode status = U_ZERO_ERROR;
4605 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4606 REGEX_CHECK_STATUS;
4607 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4608 REGEX_CHECK_STATUS;
4609 matcher.getMatchCallback(returnedFn, returnedContext, status);
4610 REGEX_CHECK_STATUS;
4611 REGEX_ASSERT(returnedFn == testCallBackFn);
4612 REGEX_ASSERT(returnedContext == &cbInfo);
4613
4614 // A short-running match shouldn't invoke the callback
4615 status = U_ZERO_ERROR;
4616 cbInfo.reset(1);
4617 UnicodeString s = "xxx";
4618 matcher.reset(s);
4619 REGEX_ASSERT(matcher.matches(status));
4620 REGEX_CHECK_STATUS;
4621 REGEX_ASSERT(cbInfo.numCalls == 0);
4622
4623 // A medium-length match that runs long enough to invoke the
4624 // callback, but not so long that the callback aborts it.
4625 status = U_ZERO_ERROR;
4626 cbInfo.reset(4);
4627 s = "aaaaaaaaaaaaaaaaaaab";
4628 matcher.reset(s);
4629 REGEX_ASSERT(matcher.matches(status)==FALSE);
4630 REGEX_CHECK_STATUS;
4631 REGEX_ASSERT(cbInfo.numCalls > 0);
4632
4633 // A longer running match that the callback function will abort.
4634 status = U_ZERO_ERROR;
4635 cbInfo.reset(4);
4636 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4637 matcher.reset(s);
4638 REGEX_ASSERT(matcher.matches(status)==FALSE);
4639 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4640 REGEX_ASSERT(cbInfo.numCalls == 4);
4641 }
4642
4643
4644}
b75a7d8f 4645
729e4ab9
A
4646
4647//
4648// FindProgressCallbacks() Test the find "progress" callback function.
4649// When set, the find progress callback will be invoked during a find operations
4650// after each return from a match attempt, giving the application the opportunity
4651// to terminate a long-running find operation before it's normal completion.
4652//
4653
4654struct progressCallBackContext {
4655 RegexTest *test;
4656 int64_t lastIndex;
4657 int32_t maxCalls;
4658 int32_t numCalls;
4659 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4660};
4661
4662U_CDECL_BEGIN
4663static UBool U_CALLCONV
4664testProgressCallBackFn(const void *context, int64_t matchIndex) {
4665 progressCallBackContext *info = (progressCallBackContext *)context;
4666 info->numCalls++;
4667 info->lastIndex = matchIndex;
4668// info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4669 return (info->numCalls < info->maxCalls);
4670}
4671U_CDECL_END
4672
4673void RegexTest::FindProgressCallbacks() {
4674 {
4675 // Getter returns NULLs if no callback has been set
4676
4677 // The variables that the getter will fill in.
4678 // Init to non-null values so that the action of the getter can be seen.
4679 const void *returnedContext = &returnedContext;
4680 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4681
4682 UErrorCode status = U_ZERO_ERROR;
4683 RegexMatcher matcher("x", 0, status);
4684 REGEX_CHECK_STATUS;
4685 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4686 REGEX_CHECK_STATUS;
4687 REGEX_ASSERT(returnedFn == NULL);
4688 REGEX_ASSERT(returnedContext == NULL);
4689 }
4690
4691 {
4692 // Set and Get work
4693 progressCallBackContext cbInfo = {this, 0, 0, 0};
4694 const void *returnedContext;
4695 URegexFindProgressCallback *returnedFn;
4696 UErrorCode status = U_ZERO_ERROR;
4697 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4698 REGEX_CHECK_STATUS;
4699 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4700 REGEX_CHECK_STATUS;
4701 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4702 REGEX_CHECK_STATUS;
4703 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4704 REGEX_ASSERT(returnedContext == &cbInfo);
4705
4706 // A short-running match should NOT invoke the callback.
4707 status = U_ZERO_ERROR;
4708 cbInfo.reset(100);
4709 UnicodeString s = "abxxx";
4710 matcher.reset(s);
4711#if 0
4712 matcher.setTrace(TRUE);
4713#endif
4714 REGEX_ASSERT(matcher.find(0, status));
4715 REGEX_CHECK_STATUS;
4716 REGEX_ASSERT(cbInfo.numCalls == 0);
4717
4718 // A medium running match that causes matcher.find() to invoke our callback for each index.
4719 status = U_ZERO_ERROR;
4720 s = "aaaaaaaaaaaaaaaaaaab";
4721 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4722 matcher.reset(s);
4723 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4724 REGEX_CHECK_STATUS;
4725 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4726
4727 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4728 status = U_ZERO_ERROR;
4729 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4730 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4731 matcher.reset(s1);
4732 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4733 REGEX_CHECK_STATUS;
4734 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4735
4736#if 0
4737 // Now a match that will succeed, but after an interruption
4738 status = U_ZERO_ERROR;
4739 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4740 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
4741 matcher.reset(s2);
4742 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4743 REGEX_CHECK_STATUS;
4744 // Now retry the match from where left off
4745 cbInfo.maxCalls = 100; // No callback limit
4746 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4747 REGEX_CHECK_STATUS;
4748#endif
4749 }
4750
4751
4752}
4753
4754
4755//---------------------------------------------------------------------------
4756//
4757// PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
4758// UTexts. The pure-C implementation of UText
4759// has no mutable backing stores, but we can
4760// use UnicodeString here to test the functionality.
4761//
4762//---------------------------------------------------------------------------
4763void RegexTest::PreAllocatedUTextCAPI () {
4764 UErrorCode status = U_ZERO_ERROR;
4765 URegularExpression *re;
4766 UText patternText = UTEXT_INITIALIZER;
4767 UnicodeString buffer;
4768 UText bufferText = UTEXT_INITIALIZER;
4769
4770 utext_openUnicodeString(&bufferText, &buffer, &status);
4771
4772 /*
4773 * getText() and getUText()
4774 */
4775 {
4776 UText text1 = UTEXT_INITIALIZER;
4777 UText text2 = UTEXT_INITIALIZER;
4778 UChar text2Chars[20];
4779 UText *resultText;
4780
4781 status = U_ZERO_ERROR;
4782 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4783 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4784 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4785 utext_openUChars(&text2, text2Chars, -1, &status);
4786
4787 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4788 re = uregex_openUText(&patternText, 0, NULL, &status);
4789
4790 /* First set a UText */
4791 uregex_setUText(re, &text1, &status);
4792 resultText = uregex_getUText(re, &bufferText, &status);
4793 REGEX_CHECK_STATUS;
4794 REGEX_ASSERT(resultText == &bufferText);
4795 utext_setNativeIndex(resultText, 0);
4796 utext_setNativeIndex(&text1, 0);
4797 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4798
4799 resultText = uregex_getUText(re, &bufferText, &status);
4800 REGEX_CHECK_STATUS;
4801 REGEX_ASSERT(resultText == &bufferText);
4802 utext_setNativeIndex(resultText, 0);
4803 utext_setNativeIndex(&text1, 0);
4804 REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4805
4806 /* Then set a UChar * */
4807 uregex_setText(re, text2Chars, 7, &status);
4808 resultText = uregex_getUText(re, &bufferText, &status);
4809 REGEX_CHECK_STATUS;
4810 REGEX_ASSERT(resultText == &bufferText);
4811 utext_setNativeIndex(resultText, 0);
4812 utext_setNativeIndex(&text2, 0);
4813 REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4814
4815 uregex_close(re);
4816 utext_close(&text1);
4817 utext_close(&text2);
4818 }
4819
4820 /*
4821 * group()
4822 */
4823 {
4824 UChar text1[80];
4825 UText *actual;
4826 UBool result;
4827 u_uastrncpy(text1, "noise abc interior def, and this is off the end", sizeof(text1)/2);
4828
4829 status = U_ZERO_ERROR;
4830 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4831 REGEX_CHECK_STATUS;
4832
4833 uregex_setText(re, text1, -1, &status);
4834 result = uregex_find(re, 0, &status);
4835 REGEX_ASSERT(result==TRUE);
4836
4837 /* Capture Group 0, the full match. Should succeed. */
4838 status = U_ZERO_ERROR;
4839 actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4840 REGEX_CHECK_STATUS;
4841 REGEX_ASSERT(actual == &bufferText);
4842 REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4843
4844 /* Capture group #1. Should succeed. */
4845 status = U_ZERO_ERROR;
4846 actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4847 REGEX_CHECK_STATUS;
4848 REGEX_ASSERT(actual == &bufferText);
4849 REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
4850
4851 /* Capture group out of range. Error. */
4852 status = U_ZERO_ERROR;
4853 actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
4854 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4855 REGEX_ASSERT(actual == &bufferText);
4856
4857 uregex_close(re);
4858
4859 }
4860
4861 /*
4862 * replaceFirst()
4863 */
4864 {
4865 UChar text1[80];
4866 UChar text2[80];
4867 UText replText = UTEXT_INITIALIZER;
4868 UText *result;
4869
4870 status = U_ZERO_ERROR;
4871 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
4872 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
4873 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
4874
4875 re = uregex_openC("x(.*?)x", 0, NULL, &status);
4876 REGEX_CHECK_STATUS;
4877
4878 /* Normal case, with match */
4879 uregex_setText(re, text1, -1, &status);
4880 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4881 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4882 REGEX_CHECK_STATUS;
4883 REGEX_ASSERT(result == &bufferText);
4884 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
4885
4886 /* No match. Text should copy to output with no changes. */
4887 uregex_setText(re, text2, -1, &status);
4888 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4889 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4890 REGEX_CHECK_STATUS;
4891 REGEX_ASSERT(result == &bufferText);
4892 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
4893
4894 /* Unicode escapes */
4895 uregex_setText(re, text1, -1, &status);
4896 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
4897 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4898 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4899 REGEX_CHECK_STATUS;
4900 REGEX_ASSERT(result == &bufferText);
4901 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
4902
4903 uregex_close(re);
4904 utext_close(&replText);
4905 }
4906
4907
4908 /*
4909 * replaceAll()
4910 */
4911 {
4912 UChar text1[80];
4913 UChar text2[80];
4914 UText replText = UTEXT_INITIALIZER;
4915 UText *result;
4916
4917 status = U_ZERO_ERROR;
4918 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
4919 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
4920 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
4921
4922 re = uregex_openC("x(.*?)x", 0, NULL, &status);
4923 REGEX_CHECK_STATUS;
4924
4925 /* Normal case, with match */
4926 uregex_setText(re, text1, -1, &status);
4927 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4928 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4929 REGEX_CHECK_STATUS;
4930 REGEX_ASSERT(result == &bufferText);
4931 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
4932
4933 /* No match. Text should copy to output with no changes. */
4934 uregex_setText(re, text2, -1, &status);
4935 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4936 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4937 REGEX_CHECK_STATUS;
4938 REGEX_ASSERT(result == &bufferText);
4939 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
4940
4941 uregex_close(re);
4942 utext_close(&replText);
4943 }
4944
4945
4946 /*
4947 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4948 * so we don't need to test it here.
4949 */
4950
4951 utext_close(&bufferText);
4952 utext_close(&patternText);
4953}
4954
4955//--------------------------------------------------------------
4956//
4957// Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
4958//
4959//---------------------------------------------------------------
4960void RegexTest::Bug7651() {
4961 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4962 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4963 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
4964 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4965 UnicodeString s("#ff @abcd This is test");
4966 RegexPattern *REPattern = NULL;
4967 RegexMatcher *REMatcher = NULL;
4968 UErrorCode status = U_ZERO_ERROR;
4969 UParseError pe;
4970
4971 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
4972 REGEX_CHECK_STATUS;
4973 REMatcher = REPattern->matcher(s, status);
4974 REGEX_CHECK_STATUS;
4975 REGEX_ASSERT(REMatcher->find());
4976 REGEX_ASSERT(REMatcher->start(status) == 0);
4977 delete REPattern;
4978 delete REMatcher;
4979 status = U_ZERO_ERROR;
4980
4981 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
4982 REGEX_CHECK_STATUS;
4983 REMatcher = REPattern->matcher(s, status);
4984 REGEX_CHECK_STATUS;
4985 REGEX_ASSERT(REMatcher->find());
4986 REGEX_ASSERT(REMatcher->start(status) == 0);
4987 delete REPattern;
4988 delete REMatcher;
4989 status = U_ZERO_ERROR;
4990 }
4991
4992void RegexTest::Bug7740() {
4993 UErrorCode status = U_ZERO_ERROR;
4994 UnicodeString pattern = "(a)";
4995 UnicodeString text = "abcdef";
4996 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
4997 REGEX_CHECK_STATUS;
4998 REGEX_ASSERT(m->lookingAt(status));
4999 REGEX_CHECK_STATUS;
5000 status = U_ILLEGAL_ARGUMENT_ERROR;
5001 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5002 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5003 REGEX_ASSERT(s == "");
5004 delete m;
5005}
5006
5007
5008
5009
b75a7d8f
A
5010#endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5011