icuSources/test/intltest/regextst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2010, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 //
   8 //   regextst.cpp
   9 //
  10 //      ICU Regular Expressions test, part of intltest.
  11 //
  12
  13 #include "intltest.h"
  14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  15
  16 #include "unicode/regex.h"
  17 #include "unicode/uchar.h"
  18 #include "unicode/ucnv.h"
  19 #include "unicode/ustring.h"
  20 #include "regextst.h"
  21 #include "uvector.h"
  22 #include "util.h"
  23 #include <stdlib.h>
  24 #include <string.h>
  25 #include <stdio.h>
  26 #include "cstring.h"
  27 #include "uinvchar.h"
  28
  29 #define SUPPORT_MUTATING_INPUT_STRING   0
  30
  31 //---------------------------------------------------------------------------
  32 //
  33 //  Test class boilerplate
  34 //
  35 //---------------------------------------------------------------------------
  36 RegexTest::RegexTest()
  37 {
  38 }
  39
  40
  41 RegexTest::~RegexTest()
  42 {
  43 }
  44
  45
  46
  47 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  48 {
  49     if (exec) logln("TestSuite RegexTest: ");
  50     switch (index) {
  51
  52         case 0: name = "Basic";
  53             if (exec) Basic();
  54             break;
  55         case 1: name = "API_Match";
  56             if (exec) API_Match();
  57             break;
  58         case 2: name = "API_Replace";
  59             if (exec) API_Replace();
  60             break;
  61         case 3: name = "API_Pattern";
  62             if (exec) API_Pattern();
  63             break;
  64         case 4:
  65 #if !UCONFIG_NO_FILE_IO
  66             name = "Extended";
  67             if (exec) Extended();
  68 #else
  69             name = "skip";
  70 #endif
  71             break;
  72         case 5: name = "Errors";
  73             if (exec) Errors();
  74             break;
  75         case 6: name = "PerlTests";
  76             if (exec) PerlTests();
  77             break;
  78         case 7: name = "Callbacks";
  79             if (exec) Callbacks();
  80             break;
  81         case 8: name = "FindProgressCallbacks";
  82             if (exec) FindProgressCallbacks();
  83             break;
  84         case 9: name = "Bug 6149";
  85              if (exec) Bug6149();
  86              break;
  87         case 10: name = "UTextBasic";
  88           if (exec) UTextBasic();
  89           break;
  90         case 11: name = "API_Match_UTF8";
  91           if (exec) API_Match_UTF8();
  92           break;
  93         case 12: name = "API_Replace_UTF8";
  94           if (exec) API_Replace_UTF8();
  95           break;
  96         case 13: name = "API_Pattern_UTF8";
  97           if (exec) API_Pattern_UTF8();
  98           break;
  99         case 14: name = "PerlTestsUTF8";
 100           if (exec) PerlTestsUTF8();
 101           break;
 102         case 15: name = "PreAllocatedUTextCAPI";
 103           if (exec) PreAllocatedUTextCAPI();
 104           break;
 105         case 16: name = "Bug 7651";
 106              if (exec) Bug7651();
 107              break;
 108         case 17: name = "Bug 7740";
 109             if (exec) Bug7740();
 110             break;
 111
 112         default: name = "";
 113             break; //needed to end loop
 114     }
 115 }
 116
 117
 118 /**
 119  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 120  * into ASCII.
 121  * @see utext_openUTF8
 122  */
 123 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 124
 125 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 126 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 127   return utext_openUTF8(ut, inv, length, status);
 128 #else
 129   char buf[1024];
 130
 131   uprv_aestrncpy((uint8_t*)buf, (const uint8_t*)inv, length);
 132
 133   return utext_openUTF8(ut, buf, length, status);
 134 #endif
 135 }
 136
 137 //---------------------------------------------------------------------------
 138 //
 139 //   Error Checking / Reporting macros used in all of the tests.
 140 //
 141 //---------------------------------------------------------------------------
 142
 143 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 144   int64_t oldIndex = utext_getNativeIndex(text);
 145   utext_setNativeIndex(text, 0);
 146   char *bufPtr = buf;
 147   UChar32 c = utext_next32From(text, 0);
 148   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 149     if (0x000020<=c && c<0x00007e) {
 150       *bufPtr = c;
 151     } else {
 152 #if 0
 153       sprintf(bufPtr,"U+%04X", c);
 154       bufPtr+= strlen(bufPtr)-1;
 155 #else
 156       *bufPtr = '%';
 157 #endif
 158     }
 159     bufPtr++;
 160     c = UTEXT_NEXT32(text);
 161   }
 162   *bufPtr = 0;
 163 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 164   char *ebuf = (char*)malloc(bufLen);
 165   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 166   uprv_strncpy(buf, ebuf, bufLen);
 167   free((void*)ebuf);
 168 #endif
 169   utext_setNativeIndex(text, oldIndex);
 170 }
 171
 // REGEX_VERBOSE_TEXT(text): logs a printable rendering (at most 199 characters) of a UText,
 //   tagged with the source file, line and the spelling of the argument expression.
 172 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
 173
     // REGEX_CHECK_STATUS: if the in-scope variable 'status' holds a failure code, reports it
     //   with dataerrln() and RETURNS from the enclosing function (so that function must return void).
 174 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
 175                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
 176
     // REGEX_ASSERT(expr): reports an error with errln() when expr compares equal to FALSE.
     //   Does not return; the test continues after a failed assertion.
 177 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
 178
     // REGEX_ASSERT_FAIL(expr, errcode): evaluates expr with a fresh local 'status' (which hides
     //   any outer variable of that name inside expr) and reports unless status ends up equal to errcode.
 179 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
 180 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
 181     __LINE__, u_errorName(errcode), u_errorName(status));};}
 182
     // REGEX_CHECK_STATUS_L(line): like REGEX_CHECK_STATUS but also prints a caller-supplied line
     //   number, and does NOT return from the enclosing function.
 183 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
 184     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
 185
     // REGEX_ASSERT_L(expr, line): like REGEX_ASSERT but also prints a caller-supplied line
     //   number, and RETURNS from the enclosing function on failure.
 186 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
 187     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
 188
 189 /**
 190  * @param expected expected text in UTF-8 (not platform) codepage
 191  */
 192 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 193     UErrorCode status = U_ZERO_ERROR;
 194     UText expectedText = UTEXT_INITIALIZER;
 195     utext_openUTF8(&expectedText, expected, -1, &status);
 196     if(U_FAILURE(status)) {
 197       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 198       return;
 199     }
 200     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 201       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 202       return;
 203     }
 204     utext_setNativeIndex(actual, 0);
 205     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
 206         char buf[201 /*21*/];
 207         char expectedBuf[201];
 208         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 209         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 210         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 211     }
 212     utext_close(&expectedText);
 213 }
 214 /**
 215  * @param expected invariant (platform local text) input
 216  */
 217
 218 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 219     UErrorCode status = U_ZERO_ERROR;
 220     UText expectedText = UTEXT_INITIALIZER;
 221     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 222     if(U_FAILURE(status)) {
 223       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 224       return;
 225     }
 226     utext_setNativeIndex(actual, 0);
 227     if (utext_compare(&expectedText, -1, actual, -1) != 0) {
 228         char buf[201 /*21*/];
 229         char expectedBuf[201];
 230         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 231         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 232         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 233     }
 234     utext_close(&expectedText);
 235 }
 236
 237 /**
 238  * Checks a UText against expected text given in UTF-8; forwards to assertUText() with the caller's file and line.
 239  */
 240 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
 241 /**
 242  * Checks a UText against expected invariant-character text; forwards to assertUTextInvariant() with the caller's file and line.
 243  */
 244 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 245
 246
 247 //---------------------------------------------------------------------------
 248 //
 249 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 250 //                       for the lookingAt() and matches() functions.
 251 //
 252 //       usage:
 253 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 254 //
 255 //          The expected results are UBool - TRUE or FALSE.
 256 //          The input text is unescaped.  The pattern is not.
 257 //          Each use runs the check twice: once through doRegexLMTest() and once through
 258 //          doRegexLMTestUTF8(); both receive the line number of the macro invocation.
 259 //---------------------------------------------------------------------------
 260
 261 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 262
 263 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 264     const UnicodeString pattern(pat, -1, US_INV);
 265     const UnicodeString inputText(text, -1, US_INV);
 266     UErrorCode          status  = U_ZERO_ERROR;
 267     UParseError         pe;
 268     RegexPattern        *REPattern = NULL;
 269     RegexMatcher        *REMatcher = NULL;
 270     UBool               retVal     = TRUE;
 271
 272     UnicodeString patString(pat, -1, US_INV);
 273     REPattern = RegexPattern::compile(patString, 0, pe, status);
 274     if (U_FAILURE(status)) {
 275         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 276             line, u_errorName(status));
 277         return FALSE;
 278     }
 279     if (line==376) { RegexPatternDump(REPattern);}
 280
 281     UnicodeString inputString(inputText);
 282     UnicodeString unEscapedInput = inputString.unescape();
 283     REMatcher = REPattern->matcher(unEscapedInput, status);
 284     if (U_FAILURE(status)) {
 285         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 286             line, u_errorName(status));
 287         return FALSE;
 288     }
 289
 290     UBool actualmatch;
 291     actualmatch = REMatcher->lookingAt(status);
 292     if (U_FAILURE(status)) {
 293         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 294             line, u_errorName(status));
 295         retVal =  FALSE;
 296     }
 297     if (actualmatch != looking) {
 298         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 299         retVal = FALSE;
 300     }
 301
 302     status = U_ZERO_ERROR;
 303     actualmatch = REMatcher->matches(status);
 304     if (U_FAILURE(status)) {
 305         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 306             line, u_errorName(status));
 307         retVal = FALSE;
 308     }
 309     if (actualmatch != match) {
 310         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 311         retVal = FALSE;
 312     }
 313
 314     if (retVal == FALSE) {
 315         RegexPatternDump(REPattern);
 316     }
 317
 318     delete REPattern;
 319     delete REMatcher;
 320     return retVal;
 321 }
 322
 323
 324 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 325     UText               pattern    = UTEXT_INITIALIZER;
 326     int32_t             inputUTF8Length;
 327     char                *textChars = NULL;
 328     UText               inputText  = UTEXT_INITIALIZER;
 329     UErrorCode          status     = U_ZERO_ERROR;
 330     UParseError         pe;
 331     RegexPattern        *REPattern = NULL;
 332     RegexMatcher        *REMatcher = NULL;
 333     UBool               retVal     = TRUE;
 334
 335     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 336     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 337     if (U_FAILURE(status)) {
 338         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 339             line, u_errorName(status));
 340         return FALSE;
 341     }
 342
 343     UnicodeString inputString(text, -1, US_INV);
 344     UnicodeString unEscapedInput = inputString.unescape();
 345     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 346     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 347
 348     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 349     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 350         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 351         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 352         return TRUE; // not a failure of the Regex engine
 353     }
 354     status = U_ZERO_ERROR; // buffer overflow
 355     textChars = new char[inputUTF8Length+1];
 356     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 357     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 358
 359     REMatcher = REPattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
 360     if (U_FAILURE(status)) {
 361         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 362             line, u_errorName(status));
 363         return FALSE;
 364     }
 365
 366     UBool actualmatch;
 367     actualmatch = REMatcher->lookingAt(status);
 368     if (U_FAILURE(status)) {
 369         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 370             line, u_errorName(status));
 371         retVal =  FALSE;
 372     }
 373     if (actualmatch != looking) {
 374         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 375         retVal = FALSE;
 376     }
 377
 378     status = U_ZERO_ERROR;
 379     actualmatch = REMatcher->matches(status);
 380     if (U_FAILURE(status)) {
 381         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 382             line, u_errorName(status));
 383         retVal = FALSE;
 384     }
 385     if (actualmatch != match) {
 386         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 387         retVal = FALSE;
 388     }
 389
 390     if (retVal == FALSE) {
 391         RegexPatternDump(REPattern);
 392     }
 393
 394     delete REPattern;
 395     delete REMatcher;
 396     utext_close(&inputText);
 397     utext_close(&pattern);
 398     delete[] textChars;
 399     return retVal;
 400 }
 401
 402
 403
 404 //---------------------------------------------------------------------------
 405 //
 406 //    REGEX_ERR       Macro + invocation function to simplify writing tests
 407 //                       regex tests for incorrect patterns
 408 //
 409 //       usage:
 410 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 411 //
 412 //---------------------------------------------------------------------------
 413 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 414
 415 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 416                           UErrorCode expectedStatus, int32_t line) {
 417     UnicodeString       pattern(pat);
 418
 419     UErrorCode          status         = U_ZERO_ERROR;
 420     UParseError         pe;
 421     RegexPattern        *callerPattern = NULL;
 422
 423     //
 424     //  Compile the caller's pattern
 425     //
 426     UnicodeString patString(pat);
 427     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 428     if (status != expectedStatus) {
 429         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 430     } else {
 431         if (status != U_ZERO_ERROR) {
 432             if (pe.line != errLine || pe.offset != errCol) {
 433                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 434                     line, errLine, errCol, pe.line, pe.offset);
 435             }
 436         }
 437     }
 438
 439     delete callerPattern;
 440
 441     //
 442     //  Compile again, using a UTF-8-based UText
 443     //
 444     UText patternText = UTEXT_INITIALIZER;
 445     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 446     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 447     if (status != expectedStatus) {
 448         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 449     } else {
 450         if (status != U_ZERO_ERROR) {
 451             if (pe.line != errLine || pe.offset != errCol) {
 452                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 453                     line, errLine, errCol, pe.line, pe.offset);
 454             }
 455         }
 456     }
 457
 458     delete callerPattern;
 459     utext_close(&patternText);
 460 }
 461
 462
 463
 464 //---------------------------------------------------------------------------
 465 //
 466 //      Basic      Check for basic functionality of regex pattern matching.
 467 //                 Avoid the use of REGEX_FIND test macro, which has
 468 //                 substantial dependencies on basic Regex functionality.
 469 //
 470 //---------------------------------------------------------------------------
 471 void RegexTest::Basic() {
 472     // Each REGEX_TESTLM(pattern, input, lookingAt, matches) line below states the expected
 473     // results of lookingAt() and matches() for that pattern applied to that input.
 474 //
 475 // Debug - slide failing test cases early
 476 //
 477 #if 0
 478     {
 479         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
 480         UParseError pe;
 481         UErrorCode  status = U_ZERO_ERROR;
 482         RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
 483         // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
 484         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
 485     }
 486     exit(1);
 487 #endif
 488
 489
 490     //
 491     // Pattern with parentheses
 492     //
 493     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
 494     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
 495     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
 496
 497     //
 498     // Patterns with *
 499     //
 500     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
 501     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
 502     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
 503     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
 504     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
 505
 506     REGEX_TESTLM("a*", "",  TRUE, TRUE);
 507     REGEX_TESTLM("a*", "b", TRUE, FALSE);
 508
 509
 510     //
 511     //  Patterns with "."
 512     //
 513     REGEX_TESTLM(".", "abc", TRUE, FALSE);
 514     REGEX_TESTLM("...", "abc", TRUE, TRUE);
 515     REGEX_TESTLM("....", "abc", FALSE, FALSE);
 516     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
 517     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
 518     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
 519     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
 520     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
 521
 522     //
 523     //  Patterns with * applied to chars at end of literal string
 524     //
 525     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
 526     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
 527
 528     //
 529     //  Supplemental chars match as single chars, not a pair of surrogates.
 530     //
 531     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
 532     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
 533     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
 534
 535
 536     //
 537     //  UnicodeSets in the pattern
 538     //
 539     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
 540     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
 541     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
 542     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 543     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);   // NOTE(review): exact duplicate of the previous line
 544     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
 545
 546     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
 547     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
 548     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
 549     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
 550     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
 551
 552     //
 553     //   OR operator in patterns
 554     //
 555     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
 556     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
 557     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
 558     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
 559
 560     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
 561     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
 562     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
 563     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
 564     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
 565     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
 566
 567     //
 568     //  + (one or more)
 569     //
 570     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
 571     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
 572     REGEX_TESTLM("b+", "", FALSE, FALSE);
 573     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
 574     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
 575     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
 576
 577     //
 578     //   ? (zero or one)
 579     //
 580     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
 581     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
 582     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
 583     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
 584     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
 585     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
 586     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
 587     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
 588     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
 589
 590     //
 591     //  Escape sequences that become single literal chars, handled internally
 592     //   by ICU's Unescape.
 593     //
 594
 595     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
 596     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
 597     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
 598     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
 599     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
 600     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
 601     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        // Carriage return
 602     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
 603     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
 604     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
 605
 606     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
 607     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
 608
 609     // Escape of special chars in patterns
 610     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
 611 }
 612
 613
 614 //---------------------------------------------------------------------------
 615 //
 616 //    UTextBasic   Check for quirks that are specific to the UText
 617 //                 implementation.
 618 //
 619 //---------------------------------------------------------------------------
 620 void RegexTest::UTextBasic() {
 621     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 622     UErrorCode status = U_ZERO_ERROR;
 623     UText pattern = UTEXT_INITIALIZER;
 624     utext_openUTF8(&pattern, str_abc, -1, &status);
 625     RegexMatcher matcher(&pattern, 0, status);
 626     REGEX_CHECK_STATUS;
 627
 628     UText input = UTEXT_INITIALIZER;
 629     utext_openUTF8(&input, str_abc, -1, &status);
 630     REGEX_CHECK_STATUS;
 631     matcher.reset(&input);
 632     REGEX_CHECK_STATUS;
 633     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 634
 635     matcher.reset(matcher.inputText());
 636     REGEX_CHECK_STATUS;
 637     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 638
 639     utext_close(&pattern);
 640     utext_close(&input);
 641 }
 642
 643
 644 //---------------------------------------------------------------------------
 645 //
 646 //      API_Match   Test that the API for class RegexMatcher
 647 //                  is present and nominally working, but excluding functions
 648 //                  implementing replace operations.
 649 //
 650 //---------------------------------------------------------------------------
 651 void RegexTest::API_Match() {
 652     UParseError         pe;
 653     UErrorCode          status=U_ZERO_ERROR;
 654     int32_t             flags = 0;
 655
 656     //
 657     // Debug - slide failing test cases early
 658     //
 659 #if 0
 660     {
 661     }
 662     return;
 663 #endif
 664
 665     //
 666     // Simple pattern compilation
 667     //
 668     {
 669         UnicodeString       re("abc");
 670         RegexPattern        *pat2;
 671         pat2 = RegexPattern::compile(re, flags, pe, status);
 672         REGEX_CHECK_STATUS;
 673
 674         UnicodeString inStr1 = "abcdef this is a test";
 675         UnicodeString instr2 = "not abc";
 676         UnicodeString empty  = "";
 677
 678
 679         //
 680         // Matcher creation and reset.
 681         //
 682         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 683         REGEX_CHECK_STATUS;
 684         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 685         REGEX_ASSERT(m1->input() == inStr1);
 686         m1->reset(instr2);
 687         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 688         REGEX_ASSERT(m1->input() == instr2);
 689         m1->reset(inStr1);
 690         REGEX_ASSERT(m1->input() == inStr1);
 691         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 692         m1->reset(empty);
 693         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 694         REGEX_ASSERT(m1->input() == empty);
 695         REGEX_ASSERT(&m1->pattern() == pat2);
 696
 697         //
 698         //  reset(pos, status)
 699         //
 700         m1->reset(inStr1);
 701         m1->reset(4, status);
 702         REGEX_CHECK_STATUS;
 703         REGEX_ASSERT(m1->input() == inStr1);
 704         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 705
 706         m1->reset(-1, status);
 707         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 708         status = U_ZERO_ERROR;
 709
 710         m1->reset(0, status);
 711         REGEX_CHECK_STATUS;
 712         status = U_ZERO_ERROR;
 713
 714         int32_t len = m1->input().length();
 715         m1->reset(len-1, status);
 716         REGEX_CHECK_STATUS;
 717         status = U_ZERO_ERROR;
 718
 719         m1->reset(len, status);
 720         REGEX_CHECK_STATUS;
 721         status = U_ZERO_ERROR;
 722
 723         m1->reset(len+1, status);
 724         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 725         status = U_ZERO_ERROR;
 726
 727         //
 728         // match(pos, status)
 729         //
 730         m1->reset(instr2);
 731         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 732         m1->reset();
 733         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 734         m1->reset();
 735         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 736         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 737         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 738         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 739
 740         // Match() at end of string should fail, but should not
 741         //  be an error.
 742         status = U_ZERO_ERROR;
 743         len = m1->input().length();
 744         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 745         REGEX_CHECK_STATUS;
 746
 747         // Match beyond end of string should fail with an error.
 748         status = U_ZERO_ERROR;
 749         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 750         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 751
 752         // Successful match at end of string.
 753         {
 754             status = U_ZERO_ERROR;
 755             RegexMatcher m("A?", 0, status);  // will match zero length string.
 756             REGEX_CHECK_STATUS;
 757             m.reset(inStr1);
 758             len = inStr1.length();
 759             REGEX_ASSERT(m.matches(len, status) == TRUE);
 760             REGEX_CHECK_STATUS;
 761             m.reset(empty);
 762             REGEX_ASSERT(m.matches(0, status) == TRUE);
 763             REGEX_CHECK_STATUS;
 764         }
 765
 766
 767         //
 768         // lookingAt(pos, status)
 769         //
 770         status = U_ZERO_ERROR;
 771         m1->reset(instr2);  // "not abc"
 772         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 773         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 774         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 775         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 776         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 777         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 778         status = U_ZERO_ERROR;
 779         len = m1->input().length();
 780         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 781         REGEX_CHECK_STATUS;
 782         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 783         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 784
 785         delete m1;
 786         delete pat2;
 787     }
 788
 789
 790     //
 791     // Capture Group.
 792     //     RegexMatcher::start();
 793     //     RegexMatcher::end();
 794     //     RegexMatcher::groupCount();
 795     //
 796     {
 797         int32_t             flags=0;
 798         UParseError         pe;
 799         UErrorCode          status=U_ZERO_ERROR;
 800
 801         UnicodeString       re("01(23(45)67)(.*)");
 802         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 803         REGEX_CHECK_STATUS;
 804         UnicodeString data = "0123456789";
 805
 806         RegexMatcher *matcher = pat->matcher(data, status);
 807         REGEX_CHECK_STATUS;
 808         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 809         static const int32_t matchStarts[] = {0,  2, 4, 8};
 810         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 811         int32_t i;
 812         for (i=0; i<4; i++) {
 813             int32_t actualStart = matcher->start(i, status);
 814             REGEX_CHECK_STATUS;
 815             if (actualStart != matchStarts[i]) {
 816                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 817                     __LINE__, i, matchStarts[i], actualStart);
 818             }
 819             int32_t actualEnd = matcher->end(i, status);
 820             REGEX_CHECK_STATUS;
 821             if (actualEnd != matchEnds[i]) {
 822                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 823                     __LINE__, i, matchEnds[i], actualEnd);
 824             }
 825         }
 826
 827         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 828         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 829
 830         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 831         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 832         matcher->reset();
 833         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 834
 835         matcher->lookingAt(status);
 836         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 837         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 838         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 839         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 840         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 841         REGEX_CHECK_STATUS;
 842         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 843         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 844         matcher->reset();
 845         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 846
 847         delete matcher;
 848         delete pat;
 849
 850     }
 851
 852     //
 853     //  find
 854     //
 855     {
 856         int32_t             flags=0;
 857         UParseError         pe;
 858         UErrorCode          status=U_ZERO_ERROR;
 859
 860         UnicodeString       re("abc");
 861         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 862         REGEX_CHECK_STATUS;
 863         UnicodeString data = ".abc..abc...abc..";
 864         //                    012345678901234567
 865
 866         RegexMatcher *matcher = pat->matcher(data, status);
 867         REGEX_CHECK_STATUS;
 868         REGEX_ASSERT(matcher->find());
 869         REGEX_ASSERT(matcher->start(status) == 1);
 870         REGEX_ASSERT(matcher->find());
 871         REGEX_ASSERT(matcher->start(status) == 6);
 872         REGEX_ASSERT(matcher->find());
 873         REGEX_ASSERT(matcher->start(status) == 12);
 874         REGEX_ASSERT(matcher->find() == FALSE);
 875         REGEX_ASSERT(matcher->find() == FALSE);
 876
 877         matcher->reset();
 878         REGEX_ASSERT(matcher->find());
 879         REGEX_ASSERT(matcher->start(status) == 1);
 880
 881         REGEX_ASSERT(matcher->find(0, status));
 882         REGEX_ASSERT(matcher->start(status) == 1);
 883         REGEX_ASSERT(matcher->find(1, status));
 884         REGEX_ASSERT(matcher->start(status) == 1);
 885         REGEX_ASSERT(matcher->find(2, status));
 886         REGEX_ASSERT(matcher->start(status) == 6);
 887         REGEX_ASSERT(matcher->find(12, status));
 888         REGEX_ASSERT(matcher->start(status) == 12);
 889         REGEX_ASSERT(matcher->find(13, status) == FALSE);
 890         REGEX_ASSERT(matcher->find(16, status) == FALSE);
 891         REGEX_ASSERT(matcher->find(17, status) == FALSE);
 892         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
 893
 894         status = U_ZERO_ERROR;
 895         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 896         status = U_ZERO_ERROR;
 897         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
 898
 899         REGEX_ASSERT(matcher->groupCount() == 0);
 900
 901         delete matcher;
 902         delete pat;
 903     }
 904
 905
 906     //
 907     //  find, with \G in pattern (true if at the end of a previous match).
 908     //
 909     {
 910         int32_t             flags=0;
 911         UParseError         pe;
 912         UErrorCode          status=U_ZERO_ERROR;
 913
 914         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
 915         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 916         REGEX_CHECK_STATUS;
 917         UnicodeString data = ".abcabc.abc..";
 918         //                    012345678901234567
 919
 920         RegexMatcher *matcher = pat->matcher(data, status);
 921         REGEX_CHECK_STATUS;
 922         REGEX_ASSERT(matcher->find());
 923         REGEX_ASSERT(matcher->start(status) == 0);
 924         REGEX_ASSERT(matcher->start(1, status) == -1);
 925         REGEX_ASSERT(matcher->start(2, status) == 1);
 926
 927         REGEX_ASSERT(matcher->find());
 928         REGEX_ASSERT(matcher->start(status) == 4);
 929         REGEX_ASSERT(matcher->start(1, status) == 4);
 930         REGEX_ASSERT(matcher->start(2, status) == -1);
 931         REGEX_CHECK_STATUS;
 932
 933         delete matcher;
 934         delete pat;
 935     }
 936
 937     //
 938     //   find with zero length matches, match position should bump ahead
 939     //     to prevent loops.
 940     //
 941     {
 942         int32_t                 i;
 943         UErrorCode          status=U_ZERO_ERROR;
 944         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
 945                                                       //   using an always-true look-ahead.
 946         REGEX_CHECK_STATUS;
 947         UnicodeString s("    ");
 948         m.reset(s);
 949         for (i=0; ; i++) {
 950             if (m.find() == FALSE) {
 951                 break;
 952             }
 953             REGEX_ASSERT(m.start(status) == i);
 954             REGEX_ASSERT(m.end(status) == i);
 955         }
 956         REGEX_ASSERT(i==5);
 957
 958         // Check that the bump goes over surrogate pairs OK
 959         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
 960         s = s.unescape();
 961         m.reset(s);
 962         for (i=0; ; i+=2) {
 963             if (m.find() == FALSE) {
 964                 break;
 965             }
 966             REGEX_ASSERT(m.start(status) == i);
 967             REGEX_ASSERT(m.end(status) == i);
 968         }
 969         REGEX_ASSERT(i==10);
 970     }
 971     {
 972         // find() loop breaking test.
 973         //        with pattern of /.?/, should see a series of one char matches, then a single
 974         //        match of zero length at the end of the input string.
 975         int32_t                 i;
 976         UErrorCode          status=U_ZERO_ERROR;
 977         RegexMatcher        m(".?", 0, status);
 978         REGEX_CHECK_STATUS;
 979         UnicodeString s("    ");
 980         m.reset(s);
 981         for (i=0; ; i++) {
 982             if (m.find() == FALSE) {
 983                 break;
 984             }
 985             REGEX_ASSERT(m.start(status) == i);
 986             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
 987         }
 988         REGEX_ASSERT(i==5);
 989     }
 990
 991
 992     //
 993     // Matchers with no input string behave as if they had an empty input string.
 994     //
 995
 996     {
 997         UErrorCode status = U_ZERO_ERROR;
 998         RegexMatcher  m(".?", 0, status);
 999         REGEX_CHECK_STATUS;
1000         REGEX_ASSERT(m.find());
1001         REGEX_ASSERT(m.start(status) == 0);
1002         REGEX_ASSERT(m.input() == "");
1003     }
1004     {
1005         UErrorCode status = U_ZERO_ERROR;
1006         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1007         RegexMatcher  *m = p->matcher(status);
1008         REGEX_CHECK_STATUS;
1009
1010         REGEX_ASSERT(m->find() == FALSE);
1011         REGEX_ASSERT(m->input() == "");
1012         delete m;
1013         delete p;
1014     }
1015
1016     //
1017     // Regions
1018     //
1019     {
1020         UErrorCode status = U_ZERO_ERROR;
1021         UnicodeString testString("This is test data");
1022         RegexMatcher m(".*", testString,  0, status);
1023         REGEX_CHECK_STATUS;
1024         REGEX_ASSERT(m.regionStart() == 0);
1025         REGEX_ASSERT(m.regionEnd() == testString.length());
1026         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1027         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1028
1029         m.region(2,4, status);
1030         REGEX_CHECK_STATUS;
1031         REGEX_ASSERT(m.matches(status));
1032         REGEX_ASSERT(m.start(status)==2);
1033         REGEX_ASSERT(m.end(status)==4);
1034         REGEX_CHECK_STATUS;
1035
1036         m.reset();
1037         REGEX_ASSERT(m.regionStart() == 0);
1038         REGEX_ASSERT(m.regionEnd() == testString.length());
1039
1040         UnicodeString shorterString("short");
1041         m.reset(shorterString);
1042         REGEX_ASSERT(m.regionStart() == 0);
1043         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1044
1045         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1046         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1047         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1048         REGEX_ASSERT(&m == &m.reset());
1049         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1050
1051         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1052         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1053         REGEX_ASSERT(&m == &m.reset());
1054         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1055
1056         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1057         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1058         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1059         REGEX_ASSERT(&m == &m.reset());
1060         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1061
1062         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1063         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1064         REGEX_ASSERT(&m == &m.reset());
1065         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1066
1067     }
1068
1069     //
1070     // hitEnd() and requireEnd()
1071     //
1072     {
1073         UErrorCode status = U_ZERO_ERROR;
1074         UnicodeString testString("aabb");
1075         RegexMatcher m1(".*", testString,  0, status);
1076         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1077         REGEX_ASSERT(m1.hitEnd() == TRUE);
1078         REGEX_ASSERT(m1.requireEnd() == FALSE);
1079         REGEX_CHECK_STATUS;
1080
1081         status = U_ZERO_ERROR;
1082         RegexMatcher m2("a*", testString, 0, status);
1083         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1084         REGEX_ASSERT(m2.hitEnd() == FALSE);
1085         REGEX_ASSERT(m2.requireEnd() == FALSE);
1086         REGEX_CHECK_STATUS;
1087
1088         status = U_ZERO_ERROR;
1089         RegexMatcher m3(".*$", testString, 0, status);
1090         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1091         REGEX_ASSERT(m3.hitEnd() == TRUE);
1092         REGEX_ASSERT(m3.requireEnd() == TRUE);
1093         REGEX_CHECK_STATUS;
1094     }
1095
1096
1097     //
1098     // Compilation error on reset with UChar *
1099     //   These were a hazard that people were stumbling over with runtime errors.
1100     //   Changed them to compiler errors by adding private methods that more closely
1101     //   matched the incorrect use of the functions.
1102     //
1103 #if 0
1104     {
1105         UErrorCode status = U_ZERO_ERROR;
1106         UChar ucharString[20];
1107         RegexMatcher m(".", 0, status);
1108         m.reset(ucharString);  // should not compile.
1109
1110         RegexPattern *p = RegexPattern::compile(".", 0, status);
1111         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1112
1113         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1114     }
1115 #endif
1116
1117     //
1118     //  Time Outs.
1119     //       Note:  These tests will need to be changed when the regexp engine is
1120     //              able to detect and cut short the exponential time behavior on
1121     //              this type of match.
1122     //
1123     {
1124         UErrorCode status = U_ZERO_ERROR;
1125         //    Enough 'a's in the string to cause the match to time out.
1126         //       (Each on additonal 'a' doubles the time)
1127         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1128         RegexMatcher matcher("(a+)+b", testString, 0, status);
1129         REGEX_CHECK_STATUS;
1130         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1131         matcher.setTimeLimit(100, status);
1132         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1133         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1134         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1135     }
1136     {
1137         UErrorCode status = U_ZERO_ERROR;
1138         //   Few enough 'a's to slip in under the time limit.
1139         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1140         RegexMatcher matcher("(a+)+b", testString, 0, status);
1141         REGEX_CHECK_STATUS;
1142         matcher.setTimeLimit(100, status);
1143         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1144         REGEX_CHECK_STATUS;
1145     }
1146
1147     //
1148     //  Stack Limits
1149     //
1150     {
1151         UErrorCode status = U_ZERO_ERROR;
1152         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1153
1154         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1155         //   of the '+', and makes the stack frames larger.
1156         RegexMatcher matcher("(A)+A$", testString, 0, status);
1157
1158         // With the default stack, this match should fail to run
1159         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1160         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1161
1162         // With unlimited stack, it should run
1163         status = U_ZERO_ERROR;
1164         matcher.setStackLimit(0, status);
1165         REGEX_CHECK_STATUS;
1166         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1167         REGEX_CHECK_STATUS;
1168         REGEX_ASSERT(matcher.getStackLimit() == 0);
1169
1170         // With a limited stack, it the match should fail
1171         status = U_ZERO_ERROR;
1172         matcher.setStackLimit(10000, status);
1173         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1174         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1175         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1176     }
1177
1178         // A pattern that doesn't save state should work with
1179         //   a minimal sized stack
1180     {
1181         UErrorCode status = U_ZERO_ERROR;
1182         UnicodeString testString = "abc";
1183         RegexMatcher matcher("abc", testString, 0, status);
1184         REGEX_CHECK_STATUS;
1185         matcher.setStackLimit(30, status);
1186         REGEX_CHECK_STATUS;
1187         REGEX_ASSERT(matcher.matches(status) == TRUE);
1188         REGEX_CHECK_STATUS;
1189         REGEX_ASSERT(matcher.getStackLimit() == 30);
1190
1191         // Negative stack sizes should fail
1192         status = U_ZERO_ERROR;
1193         matcher.setStackLimit(1000, status);
1194         REGEX_CHECK_STATUS;
1195         matcher.setStackLimit(-1, status);
1196         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1197         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1198     }
1199
1200
1201 }
1202
1203
1204
1205
1206
1207
1208 //---------------------------------------------------------------------------
1209 //
1210 //      API_Replace        API test for class RegexMatcher, testing the
1211 //                         Replace family of functions.
1212 //
1213 //---------------------------------------------------------------------------
1214 void RegexTest::API_Replace() {
1215     //
1216     //  Replace
1217     //
1218     int32_t             flags=0;
1219     UParseError         pe;
1220     UErrorCode          status=U_ZERO_ERROR;
1221
1222     UnicodeString       re("abc");
1223     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1224     REGEX_CHECK_STATUS;
1225     UnicodeString data = ".abc..abc...abc..";
1226     //                    012345678901234567
1227     RegexMatcher *matcher = pat->matcher(data, status);
1228
1229     //
1230     //  Plain vanilla matches.
1231     //
1232     UnicodeString  dest;
1233     dest = matcher->replaceFirst("yz", status);
1234     REGEX_CHECK_STATUS;
1235     REGEX_ASSERT(dest == ".yz..abc...abc..");
1236
1237     dest = matcher->replaceAll("yz", status);
1238     REGEX_CHECK_STATUS;
1239     REGEX_ASSERT(dest == ".yz..yz...yz..");
1240
1241     //
1242     //  Plain vanilla non-matches.
1243     //
1244     UnicodeString d2 = ".abx..abx...abx..";
1245     matcher->reset(d2);
1246     dest = matcher->replaceFirst("yz", status);
1247     REGEX_CHECK_STATUS;
1248     REGEX_ASSERT(dest == ".abx..abx...abx..");
1249
1250     dest = matcher->replaceAll("yz", status);
1251     REGEX_CHECK_STATUS;
1252     REGEX_ASSERT(dest == ".abx..abx...abx..");
1253
1254     //
1255     // Empty source string
1256     //
1257     UnicodeString d3 = "";
1258     matcher->reset(d3);
1259     dest = matcher->replaceFirst("yz", status);
1260     REGEX_CHECK_STATUS;
1261     REGEX_ASSERT(dest == "");
1262
1263     dest = matcher->replaceAll("yz", status);
1264     REGEX_CHECK_STATUS;
1265     REGEX_ASSERT(dest == "");
1266
1267     //
1268     // Empty substitution string
1269     //
1270     matcher->reset(data);              // ".abc..abc...abc.."
1271     dest = matcher->replaceFirst("", status);
1272     REGEX_CHECK_STATUS;
1273     REGEX_ASSERT(dest == "...abc...abc..");
1274
1275     dest = matcher->replaceAll("", status);
1276     REGEX_CHECK_STATUS;
1277     REGEX_ASSERT(dest == "........");
1278
1279     //
1280     // match whole string
1281     //
1282     UnicodeString d4 = "abc";
1283     matcher->reset(d4);
1284     dest = matcher->replaceFirst("xyz", status);
1285     REGEX_CHECK_STATUS;
1286     REGEX_ASSERT(dest == "xyz");
1287
1288     dest = matcher->replaceAll("xyz", status);
1289     REGEX_CHECK_STATUS;
1290     REGEX_ASSERT(dest == "xyz");
1291
1292     //
1293     // Capture Group, simple case
1294     //
1295     UnicodeString       re2("a(..)");
1296     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1297     REGEX_CHECK_STATUS;
1298     UnicodeString d5 = "abcdefg";
1299     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1300     REGEX_CHECK_STATUS;
1301     dest = matcher2->replaceFirst("$1$1", status);
1302     REGEX_CHECK_STATUS;
1303     REGEX_ASSERT(dest == "bcbcdefg");
1304
1305     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1306     REGEX_CHECK_STATUS;
1307     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1308
1309     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1310     REGEX_CHECK_STATUS;
1311     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1312
1313     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1314     replacement = replacement.unescape();
1315     dest = matcher2->replaceFirst(replacement, status);
1316     REGEX_CHECK_STATUS;
1317     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1318
1319     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1320
1321
1322     //
1323     // Replacement String with \u hex escapes
1324     //
1325     {
1326         UnicodeString  src = "abc 1 abc 2 abc 3";
1327         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1328         matcher->reset(src);
1329         UnicodeString  result = matcher->replaceAll(substitute, status);
1330         REGEX_CHECK_STATUS;
1331         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1332     }
1333     {
1334         UnicodeString  src = "abc !";
1335         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1336         matcher->reset(src);
1337         UnicodeString  result = matcher->replaceAll(substitute, status);
1338         REGEX_CHECK_STATUS;
1339         UnicodeString expected = UnicodeString("--");
1340         expected.append((UChar32)0x10000);
1341         expected.append("-- !");
1342         REGEX_ASSERT(result == expected);
1343     }
1344     // TODO:  need more through testing of capture substitutions.
1345
1346     // Bug 4057
1347     //
1348     {
1349         status = U_ZERO_ERROR;
1350         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1351         RegexMatcher m("ss(.*?)ee", 0, status);
1352         REGEX_CHECK_STATUS;
1353         UnicodeString result;
1354
1355         // Multiple finds do NOT bump up the previous appendReplacement postion.
1356         m.reset(s);
1357         m.find();
1358         m.find();
1359         m.appendReplacement(result, "ooh", status);
1360         REGEX_CHECK_STATUS;
1361         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1362
1363         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1364         status = U_ZERO_ERROR;
1365         result.truncate(0);
1366         m.reset(10, status);
1367         m.find();
1368         m.find();
1369         m.appendReplacement(result, "ooh", status);
1370         REGEX_CHECK_STATUS;
1371         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1372
1373         // find() at interior of string, appendReplacemnt still starts at beginning.
1374         status = U_ZERO_ERROR;
1375         result.truncate(0);
1376         m.reset();
1377         m.find(10, status);
1378         m.find();
1379         m.appendReplacement(result, "ooh", status);
1380         REGEX_CHECK_STATUS;
1381         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1382
1383         m.appendTail(result);
1384         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1385
1386     }
1387
1388     delete matcher2;
1389     delete pat2;
1390     delete matcher;
1391     delete pat;
1392 }
1393
1394
1395 //---------------------------------------------------------------------------
1396 //
1397 //      API_Pattern       Test that the API for class RegexPattern is
1398 //                        present and nominally working.
1399 //
1400 //---------------------------------------------------------------------------
1401 void RegexTest::API_Pattern() {
1402     RegexPattern        pata;    // Test default constructor to not crash.
1403     RegexPattern        patb;
1404
1405     REGEX_ASSERT(pata == patb);
1406     REGEX_ASSERT(pata == pata);
1407
1408     UnicodeString re1("abc[a-l][m-z]");
1409     UnicodeString re2("def");
1410     UErrorCode    status = U_ZERO_ERROR;
1411     UParseError   pe;
1412
1413     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1414     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1415     REGEX_CHECK_STATUS;
1416     REGEX_ASSERT(*pat1 == *pat1);
1417     REGEX_ASSERT(*pat1 != pata);
1418
1419     // Assign
1420     patb = *pat1;
1421     REGEX_ASSERT(patb == *pat1);
1422
1423     // Copy Construct
1424     RegexPattern patc(*pat1);
1425     REGEX_ASSERT(patc == *pat1);
1426     REGEX_ASSERT(patb == patc);
1427     REGEX_ASSERT(pat1 != pat2);
1428     patb = *pat2;
1429     REGEX_ASSERT(patb != patc);
1430     REGEX_ASSERT(patb == *pat2);
1431
1432     // Compile with no flags.
1433     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1434     REGEX_ASSERT(*pat1a == *pat1);
1435
1436     REGEX_ASSERT(pat1a->flags() == 0);
1437
1438     // Compile with different flags should be not equal
1439     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1440     REGEX_CHECK_STATUS;
1441
1442     REGEX_ASSERT(*pat1b != *pat1a);
1443     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1444     REGEX_ASSERT(pat1a->flags() == 0);
1445     delete pat1b;
1446
1447     // clone
1448     RegexPattern *pat1c = pat1->clone();
1449     REGEX_ASSERT(*pat1c == *pat1);
1450     REGEX_ASSERT(*pat1c != *pat2);
1451
1452     delete pat1c;
1453     delete pat1a;
1454     delete pat1;
1455     delete pat2;
1456
1457
1458     //
1459     //   Verify that a matcher created from a cloned pattern works.
1460     //     (Jitterbug 3423)
1461     //
1462     {
1463         UErrorCode     status     = U_ZERO_ERROR;
1464         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1465         RegexPattern  *pClone     = pSource->clone();
1466         delete         pSource;
1467         RegexMatcher  *mFromClone = pClone->matcher(status);
1468         REGEX_CHECK_STATUS;
1469         UnicodeString s = "Hello World";
1470         mFromClone->reset(s);
1471         REGEX_ASSERT(mFromClone->find() == TRUE);
1472         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1473         REGEX_ASSERT(mFromClone->find() == TRUE);
1474         REGEX_ASSERT(mFromClone->group(status) == "World");
1475         REGEX_ASSERT(mFromClone->find() == FALSE);
1476         delete mFromClone;
1477         delete pClone;
1478     }
1479
1480     //
1481     //   matches convenience API
1482     //
1483     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1484     REGEX_CHECK_STATUS;
1485     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1486     REGEX_CHECK_STATUS;
1487     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1488     REGEX_CHECK_STATUS;
1489     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1490     REGEX_CHECK_STATUS;
1491     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1492     REGEX_CHECK_STATUS;
1493     status = U_INDEX_OUTOFBOUNDS_ERROR;
1494     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1495     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1496
1497
1498     //
1499     // Split()
1500     //
1501     status = U_ZERO_ERROR;
1502     pat1 = RegexPattern::compile(" +",  pe, status);
1503     REGEX_CHECK_STATUS;
1504     UnicodeString  fields[10];
1505
1506     int32_t n;
1507     n = pat1->split("Now is the time", fields, 10, status);
1508     REGEX_CHECK_STATUS;
1509     REGEX_ASSERT(n==4);
1510     REGEX_ASSERT(fields[0]=="Now");
1511     REGEX_ASSERT(fields[1]=="is");
1512     REGEX_ASSERT(fields[2]=="the");
1513     REGEX_ASSERT(fields[3]=="time");
1514     REGEX_ASSERT(fields[4]=="");
1515
1516     n = pat1->split("Now is the time", fields, 2, status);
1517     REGEX_CHECK_STATUS;
1518     REGEX_ASSERT(n==2);
1519     REGEX_ASSERT(fields[0]=="Now");
1520     REGEX_ASSERT(fields[1]=="is the time");
1521     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1522
1523     fields[1] = "*";
1524     status = U_ZERO_ERROR;
1525     n = pat1->split("Now is the time", fields, 1, status);
1526     REGEX_CHECK_STATUS;
1527     REGEX_ASSERT(n==1);
1528     REGEX_ASSERT(fields[0]=="Now is the time");
1529     REGEX_ASSERT(fields[1]=="*");
1530     status = U_ZERO_ERROR;
1531
1532     n = pat1->split("    Now       is the time   ", fields, 10, status);
1533     REGEX_CHECK_STATUS;
1534     REGEX_ASSERT(n==5);
1535     REGEX_ASSERT(fields[0]=="");
1536     REGEX_ASSERT(fields[1]=="Now");
1537     REGEX_ASSERT(fields[2]=="is");
1538     REGEX_ASSERT(fields[3]=="the");
1539     REGEX_ASSERT(fields[4]=="time");
1540     REGEX_ASSERT(fields[5]=="");
1541
1542     n = pat1->split("     ", fields, 10, status);
1543     REGEX_CHECK_STATUS;
1544     REGEX_ASSERT(n==1);
1545     REGEX_ASSERT(fields[0]=="");
1546
1547     fields[0] = "foo";
1548     n = pat1->split("", fields, 10, status);
1549     REGEX_CHECK_STATUS;
1550     REGEX_ASSERT(n==0);
1551     REGEX_ASSERT(fields[0]=="foo");
1552
1553     delete pat1;
1554
1555     //  split, with a pattern with (capture)
1556     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1557     REGEX_CHECK_STATUS;
1558
1559     status = U_ZERO_ERROR;
1560     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1561     REGEX_CHECK_STATUS;
1562     REGEX_ASSERT(n==6);
1563     REGEX_ASSERT(fields[0]=="");
1564     REGEX_ASSERT(fields[1]=="a");
1565     REGEX_ASSERT(fields[2]=="Now is ");
1566     REGEX_ASSERT(fields[3]=="b");
1567     REGEX_ASSERT(fields[4]=="the time");
1568     REGEX_ASSERT(fields[5]=="c");
1569     REGEX_ASSERT(fields[6]=="");
1570     REGEX_ASSERT(status==U_ZERO_ERROR);
1571
1572     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1573     REGEX_CHECK_STATUS;
1574     REGEX_ASSERT(n==6);
1575     REGEX_ASSERT(fields[0]=="  ");
1576     REGEX_ASSERT(fields[1]=="a");
1577     REGEX_ASSERT(fields[2]=="Now is ");
1578     REGEX_ASSERT(fields[3]=="b");
1579     REGEX_ASSERT(fields[4]=="the time");
1580     REGEX_ASSERT(fields[5]=="c");
1581     REGEX_ASSERT(fields[6]=="");
1582
1583     status = U_ZERO_ERROR;
1584     fields[6] = "foo";
1585     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1586     REGEX_CHECK_STATUS;
1587     REGEX_ASSERT(n==6);
1588     REGEX_ASSERT(fields[0]=="  ");
1589     REGEX_ASSERT(fields[1]=="a");
1590     REGEX_ASSERT(fields[2]=="Now is ");
1591     REGEX_ASSERT(fields[3]=="b");
1592     REGEX_ASSERT(fields[4]=="the time");
1593     REGEX_ASSERT(fields[5]=="c");
1594     REGEX_ASSERT(fields[6]=="foo");
1595
1596     status = U_ZERO_ERROR;
1597     fields[5] = "foo";
1598     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1599     REGEX_CHECK_STATUS;
1600     REGEX_ASSERT(n==5);
1601     REGEX_ASSERT(fields[0]=="  ");
1602     REGEX_ASSERT(fields[1]=="a");
1603     REGEX_ASSERT(fields[2]=="Now is ");
1604     REGEX_ASSERT(fields[3]=="b");
1605     REGEX_ASSERT(fields[4]=="the time<c>");
1606     REGEX_ASSERT(fields[5]=="foo");
1607
1608     status = U_ZERO_ERROR;
1609     fields[5] = "foo";
1610     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1611     REGEX_CHECK_STATUS;
1612     REGEX_ASSERT(n==5);
1613     REGEX_ASSERT(fields[0]=="  ");
1614     REGEX_ASSERT(fields[1]=="a");
1615     REGEX_ASSERT(fields[2]=="Now is ");
1616     REGEX_ASSERT(fields[3]=="b");
1617     REGEX_ASSERT(fields[4]=="the time");
1618     REGEX_ASSERT(fields[5]=="foo");
1619
1620     status = U_ZERO_ERROR;
1621     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1622     REGEX_CHECK_STATUS;
1623     REGEX_ASSERT(n==4);
1624     REGEX_ASSERT(fields[0]=="  ");
1625     REGEX_ASSERT(fields[1]=="a");
1626     REGEX_ASSERT(fields[2]=="Now is ");
1627     REGEX_ASSERT(fields[3]=="the time<c>");
1628     status = U_ZERO_ERROR;
1629     delete pat1;
1630
1631     pat1 = RegexPattern::compile("([-,])",  pe, status);
1632     REGEX_CHECK_STATUS;
1633     n = pat1->split("1-10,20", fields, 10, status);
1634     REGEX_CHECK_STATUS;
1635     REGEX_ASSERT(n==5);
1636     REGEX_ASSERT(fields[0]=="1");
1637     REGEX_ASSERT(fields[1]=="-");
1638     REGEX_ASSERT(fields[2]=="10");
1639     REGEX_ASSERT(fields[3]==",");
1640     REGEX_ASSERT(fields[4]=="20");
1641     delete pat1;
1642
1643
1644     //
1645     // RegexPattern::pattern()
1646     //
1647     pat1 = new RegexPattern();
1648     REGEX_ASSERT(pat1->pattern() == "");
1649     delete pat1;
1650
1651     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1652     REGEX_CHECK_STATUS;
1653     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1654     delete pat1;
1655
1656
1657     //
1658     // classID functions
1659     //
1660     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1661     REGEX_CHECK_STATUS;
1662     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1663     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1664     UnicodeString Hello("Hello, world.");
1665     RegexMatcher *m = pat1->matcher(Hello, status);
1666     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1667     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1668     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1669     delete m;
1670     delete pat1;
1671
1672 }
1673
1674 //---------------------------------------------------------------------------
1675 //
1676 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1677 //                       is present and working, but excluding functions
1678 //                       implementing replace operations.
1679 //
1680 //---------------------------------------------------------------------------
1681 void RegexTest::API_Match_UTF8() {
1682     UParseError         pe;                      // passed to RegexPattern::compile() in the first section below
1683     UErrorCode          status=U_ZERO_ERROR;     // used by the first section only; later sections declare their own status, shadowing this one
1684     int32_t             flags = 0;               // no pattern flags requested
1685     // Overview: patterns and input are supplied as UText (mostly opened over UTF-8 bytes).  Each braced section below is a separate sub-test that creates and releases its own patterns, matchers and UTexts.
1686     //
1687     // Debug - slide failing test cases early
1688     //   (Compiled out.  Presumably a scratch area: put a failing case inside the braces and enable the #if so that only it runs; the return skips everything else.)
1689 #if 0
1690     {
1691     }
1692     return;
1693 #endif
1694     // NOTE(review): REGEX_CHECK_STATUS, the REGEX_ASSERT* checks, REGEX_VERBOSE_TEXT and regextst_openUTF8FromInvariant are defined outside this function; their exact behavior on failure is not visible here.
1695     //
1696     // Simple pattern compilation
1697     //   Compiles "abc" from a UText, then exercises reset(), matches(pos) and lookingAt(pos) against two inputs and an empty text.
1698     {
1699         UText               re = UTEXT_INITIALIZER;
1700         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);  // presumably opens re as UTF-8 text for the given literal - confirm against the helper
1701         RegexPattern        *pat2;
1702         pat2 = RegexPattern::compile(&re, flags, pe, status);
1703         REGEX_CHECK_STATUS;
1704
1705         UText input1 = UTEXT_INITIALIZER;
1706         UText input2 = UTEXT_INITIALIZER;
1707         UText empty  = UTEXT_INITIALIZER;
1708         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1709         REGEX_VERBOSE_TEXT(&input1);
1710         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1711         REGEX_VERBOSE_TEXT(&input2);
1712         utext_openUChars(&empty, NULL, 0, &status);  // zero-length UChar text
1713         // Lengths of the two input literals, used below as end-of-input positions for the boundary checks.
1714         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1715         int32_t input2Len = strlen("not abc");
1716
1717         // NOTE: expected strings in this function are written as explicit byte arrays (with the text in a trailing comment), presumably so they do not depend on the compiler execution charset.
1718         //
1719         // Matcher creation and reset.
1720         //   lookingAt() should succeed on input1 (it starts with "abc") and fail on input2 and on the empty text; inputText() should reflect the text most recently set.
1721         RegexMatcher *m1 = pat2->matcher(&input1, RegexPattern::PATTERN_IS_UTEXT, status);
1722         REGEX_CHECK_STATUS;
1723         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1724         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1725         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1726         m1->reset(&input2);
1727         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1728         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1729         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1730         m1->reset(&input1);
1731         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1732         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1733         m1->reset(&empty);
1734         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1735         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1736
1737         //
1738         //  reset(pos, status)
1739         //   Positions 0 through input1Len are accepted; -1 and input1Len+1 must set U_INDEX_OUTOFBOUNDS_ERROR.
1740         m1->reset(&input1);
1741         m1->reset(4, status);
1742         REGEX_CHECK_STATUS;
1743         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1744         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1745
1746         m1->reset(-1, status);
1747         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1748         status = U_ZERO_ERROR;
1749
1750         m1->reset(0, status);
1751         REGEX_CHECK_STATUS;
1752         status = U_ZERO_ERROR;
1753
1754         m1->reset(input1Len-1, status);
1755         REGEX_CHECK_STATUS;
1756         status = U_ZERO_ERROR;
1757
1758         m1->reset(input1Len, status);
1759         REGEX_CHECK_STATUS;
1760         status = U_ZERO_ERROR;
1761
1762         m1->reset(input1Len+1, status);
1763         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1764         status = U_ZERO_ERROR;
1765
1766         //
1767         // match(pos, status)
1768         //   In input2 ("not abc") the pattern matches only when started at index 4; a start index of -1 is an error.
1769         m1->reset(&input2);
1770         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1771         m1->reset();
1772         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1773         m1->reset();
1774         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1775         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1776         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1777         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1778
1779         // Match() at end of string should fail, but should not
1780         //  be an error.
1781         status = U_ZERO_ERROR;
1782         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1783         REGEX_CHECK_STATUS;
1784
1785         // Match beyond end of string should fail with an error.
1786         status = U_ZERO_ERROR;
1787         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1788         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1789
1790         // Successful match at end of string.
1791         {
1792             status = U_ZERO_ERROR;
1793             RegexMatcher m("A?", 0, status);  // will match zero length string.
1794             REGEX_CHECK_STATUS;
1795             m.reset(&input1);
1796             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1797             REGEX_CHECK_STATUS;
1798             m.reset(&empty);
1799             REGEX_ASSERT(m.matches(0, status) == TRUE);
1800             REGEX_CHECK_STATUS;
1801         }
1802
1803
1804         //
1805         // lookingAt(pos, status)
1806         //   Same start positions as the matches(pos) checks above, applied to lookingAt(pos): only index 4 succeeds, -1 and input2Len+1 are errors.
1807         status = U_ZERO_ERROR;
1808         m1->reset(&input2);  // "not abc"
1809         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1810         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1811         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1812         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1813         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1814         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1815         status = U_ZERO_ERROR;
1816         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1817         REGEX_CHECK_STATUS;
1818         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1819         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1820         // Release the matcher and pattern, then close the UTexts opened in this section.
1821         delete m1;
1822         delete pat2;
1823
1824         utext_close(&re);
1825         utext_close(&input1);
1826         utext_close(&input2);
1827         utext_close(&empty);
1828     }
1829
1830
1831     //
1832     // Capture Group.
1833     //     RegexMatcher::start();
1834     //     RegexMatcher::end();
1835     //     RegexMatcher::groupCount();
1836     //   Pattern 01(23(45)67)(.*) against "0123456789": checks start()/end() for groups 0-3 and the UText-returning group() overloads.  (groupCount() itself is asserted in the find section further down.)
1837     {
1838         int32_t             flags=0;
1839         UParseError         pe;
1840         UErrorCode          status=U_ZERO_ERROR;
1841         UText               re=UTEXT_INITIALIZER;
1842         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1843         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1844
1845         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1846         REGEX_CHECK_STATUS;
1847
1848         UText input = UTEXT_INITIALIZER;
1849         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1850         utext_openUTF8(&input, str_0123456789, -1, &status);
1851
1852         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1853         REGEX_CHECK_STATUS;
1854         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1855         static const int32_t matchStarts[] = {0,  2, 4, 8};   // expected start index of groups 0..3
1856         static const int32_t matchEnds[]   = {10, 8, 6, 10};  // expected end index of groups 0..3
1857         int32_t i;
1858         for (i=0; i<4; i++) {
1859             int32_t actualStart = matcher->start(i, status);
1860             REGEX_CHECK_STATUS;
1861             if (actualStart != matchStarts[i]) {
1862                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1863                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
1864             }
1865             int32_t actualEnd = matcher->end(i, status);
1866             REGEX_CHECK_STATUS;
1867             if (actualEnd != matchEnds[i]) {
1868                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1869                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1870             }
1871         }
1872         // The overloads without a group number must agree with group 0.
1873         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1874         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1875         // Group numbers outside 0..3 are expected to fail with an index error; after reset() there is no match to query.
1876         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1877         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1878         matcher->reset();
1879         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1880         // Match again so that the group() calls below have a current match to read from.
1881         matcher->lookingAt(status);
1882         // destText wraps the UnicodeString dest and is the destination passed to the group() overloads that take one.
1883         UnicodeString dest;
1884         UText destText = UTEXT_INITIALIZER;
1885         utext_openUnicodeString(&destText, &dest, &status);
1886         UText *result;
1887         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1888         //      Test shallow-clone API
1889         int64_t   group_len;
1890         result = matcher->group((UText *)NULL, group_len, status);  // NULL destination: the returned UText is closed by the test below
1891         REGEX_CHECK_STATUS;
1892         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1893         utext_close(result);
1894         result = matcher->group(0, &destText, group_len, status);
1895         REGEX_CHECK_STATUS;
1896         REGEX_ASSERT(result == &destText);
1897         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1898         //  destText is now immutable, reopen it
1899         utext_close(&destText);
1900         utext_openUnicodeString(&destText, &dest, &status);
1901         // group(n, dest, status) for n = 0..3: once with a NULL dest (result closed afterwards) and once into destText (result must be &destText).
1902         result = matcher->group(0, NULL, status);
1903         REGEX_CHECK_STATUS;
1904         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1905         utext_close(result);
1906         result = matcher->group(0, &destText, status);
1907         REGEX_CHECK_STATUS;
1908         REGEX_ASSERT(result == &destText);
1909         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
1910
1911         result = matcher->group(1, NULL, status);
1912         REGEX_CHECK_STATUS;
1913         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
1914         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
1915         utext_close(result);
1916         result = matcher->group(1, &destText, status);
1917         REGEX_CHECK_STATUS;
1918         REGEX_ASSERT(result == &destText);
1919         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
1920
1921         result = matcher->group(2, NULL, status);
1922         REGEX_CHECK_STATUS;
1923         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
1924         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
1925         utext_close(result);
1926         result = matcher->group(2, &destText, status);
1927         REGEX_CHECK_STATUS;
1928         REGEX_ASSERT(result == &destText);
1929         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
1930
1931         result = matcher->group(3, NULL, status);
1932         REGEX_CHECK_STATUS;
1933         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
1934         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
1935         utext_close(result);
1936         result = matcher->group(3, &destText, status);
1937         REGEX_CHECK_STATUS;
1938         REGEX_ASSERT(result == &destText);
1939         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
1940         // Error cases for group(): out-of-range group numbers, and no current match after reset().
1941         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1942         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1943         matcher->reset();
1944         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
1945
1946         delete matcher;
1947         delete pat;
1948
1949         utext_close(&destText);
1950         utext_close(&input);
1951         utext_close(&re);
1952     }
1953
1954     //
1955     //  find
1956     //   Pattern "abc" against ".abc..abc...abc..": successive find() calls, find(startPos, status), and out-of-range start positions.
1957     {
1958         int32_t             flags=0;
1959         UParseError         pe;
1960         UErrorCode          status=U_ZERO_ERROR;
1961         UText               re=UTEXT_INITIALIZER;
1962         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
1963         utext_openUTF8(&re, str_abc, -1, &status);
1964
1965         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1966         REGEX_CHECK_STATUS;
1967         UText input = UTEXT_INITIALIZER;
1968         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
1969         utext_openUTF8(&input, str_abcabcabc, -1, &status);
1970         //                      012345678901234567
1971         // Successive find() calls walk the three occurrences (at 1, 6 and 12) and then keep returning FALSE.
1972         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
1973         REGEX_CHECK_STATUS;
1974         REGEX_ASSERT(matcher->find());
1975         REGEX_ASSERT(matcher->start(status) == 1);
1976         REGEX_ASSERT(matcher->find());
1977         REGEX_ASSERT(matcher->start(status) == 6);
1978         REGEX_ASSERT(matcher->find());
1979         REGEX_ASSERT(matcher->start(status) == 12);
1980         REGEX_ASSERT(matcher->find() == FALSE);
1981         REGEX_ASSERT(matcher->find() == FALSE);
1982         // After reset() the search starts over from the first occurrence.
1983         matcher->reset();
1984         REGEX_ASSERT(matcher->find());
1985         REGEX_ASSERT(matcher->start(status) == 1);
1986         // find(pos, status) is expected to return the first match starting at or after pos; from 13 onward there is none.
1987         REGEX_ASSERT(matcher->find(0, status));
1988         REGEX_ASSERT(matcher->start(status) == 1);
1989         REGEX_ASSERT(matcher->find(1, status));
1990         REGEX_ASSERT(matcher->start(status) == 1);
1991         REGEX_ASSERT(matcher->find(2, status));
1992         REGEX_ASSERT(matcher->start(status) == 6);
1993         REGEX_ASSERT(matcher->find(12, status));
1994         REGEX_ASSERT(matcher->start(status) == 12);
1995         REGEX_ASSERT(matcher->find(13, status) == FALSE);
1996         REGEX_ASSERT(matcher->find(16, status) == FALSE);
1997         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1998         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);  // the last find failed, so there is no match to query
1999         // Start positions below 0 or beyond the 17-byte input must report U_INDEX_OUTOFBOUNDS_ERROR.
2000         status = U_ZERO_ERROR;
2001         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2002         status = U_ZERO_ERROR;
2003         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2004         // The pattern "abc" contains no capture groups.
2005         REGEX_ASSERT(matcher->groupCount() == 0);
2006
2007         delete matcher;
2008         delete pat;
2009
2010         utext_close(&input);
2011         utext_close(&re);
2012     }
2013
2014
2015     //
2016     //  find, with \G in pattern (true if at the end of a previous match).
2017     //   Group 1 is the \Gabc alternative, group 2 the plain abc alternative; start(n) == -1 presumably means that group took no part in the match.
2018     {
2019         int32_t             flags=0;
2020         UParseError         pe;
2021         UErrorCode          status=U_ZERO_ERROR;
2022         UText               re=UTEXT_INITIALIZER;
2023         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2024         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2025
2026         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2027
2028         REGEX_CHECK_STATUS;
2029         UText input = UTEXT_INITIALIZER;
2030         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2031         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2032         //                      012345678901234567
2033         // First find: the overall match starts at 0 and the plain alternative (group 2) matches at index 1.
2034         RegexMatcher *matcher = pat->matcher(&input, RegexPattern::PATTERN_IS_UTEXT, status);
2035         REGEX_CHECK_STATUS;
2036         REGEX_ASSERT(matcher->find());
2037         REGEX_ASSERT(matcher->start(status) == 0);
2038         REGEX_ASSERT(matcher->start(1, status) == -1);
2039         REGEX_ASSERT(matcher->start(2, status) == 1);
2040         // Second find: the match begins at index 4 and is made by the \G alternative (group 1).
2041         REGEX_ASSERT(matcher->find());
2042         REGEX_ASSERT(matcher->start(status) == 4);
2043         REGEX_ASSERT(matcher->start(1, status) == 4);
2044         REGEX_ASSERT(matcher->start(2, status) == -1);
2045         REGEX_CHECK_STATUS;
2046
2047         delete matcher;
2048         delete pat;
2049
2050         utext_close(&input);
2051         utext_close(&re);
2052     }
2053
2054     //
2055     //   find with zero length matches, match position should bump ahead
2056     //     to prevent loops.
2057     //
2058     {
2059         int32_t                 i;
2060         UErrorCode          status=U_ZERO_ERROR;
2061         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2062                                                       //   using an always-true look-ahead.
2063         REGEX_CHECK_STATUS;
2064         UText s = UTEXT_INITIALIZER;
2065         utext_openUTF8(&s, "    ", -1, &status);
2066         m.reset(&s);
2067         for (i=0; ; i++) {
2068             if (m.find() == FALSE) {
2069                 break;
2070             }
2071             REGEX_ASSERT(m.start(status) == i);
2072             REGEX_ASSERT(m.end(status) == i);
2073         }
2074         REGEX_ASSERT(i==5);  // empty matches expected at positions 0..4 of the four-space input, i.e. including end of input
2075
2076         // Check that the bump goes over characters outside the BMP OK
2077         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2078         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2079         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);  // the same UText s is opened again, now over the supplementary-character bytes
2080         m.reset(&s);
2081         for (i=0; ; i+=4) {  // each code point above occupies 4 bytes, so the expected match positions advance by 4
2082             if (m.find() == FALSE) {
2083                 break;
2084             }
2085             REGEX_ASSERT(m.start(status) == i);
2086             REGEX_ASSERT(m.end(status) == i);
2087         }
2088         REGEX_ASSERT(i==20);  // empty matches expected at 0, 4, 8, 12 and 16 (end of input)
2089
2090         utext_close(&s);
2091     }
2092     {
2093         // find() loop breaking test.
2094         //        with pattern of /.?/, should see a series of one char matches, then a single
2095         //        match of zero length at the end of the input string.
2096         int32_t                 i;
2097         UErrorCode          status=U_ZERO_ERROR;
2098         RegexMatcher        m(".?", 0, status);
2099         REGEX_CHECK_STATUS;
2100         UText s = UTEXT_INITIALIZER;
2101         utext_openUTF8(&s, "    ", -1, &status);
2102         m.reset(&s);
2103         for (i=0; ; i++) {
2104             if (m.find() == FALSE) {
2105                 break;
2106             }
2107             REGEX_ASSERT(m.start(status) == i);
2108             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));  // one-char matches for i<4, then an empty match at index 4
2109         }
2110         REGEX_ASSERT(i==5);
2111
2112         utext_close(&s);
2113     }
2114
2115
2116     //
2117     // Matchers with no input string behave as if they had an empty input string.
2118     //
2119     // First case: a stack RegexMatcher built from a pattern only; second case: a matcher obtained from RegexPattern::matcher(status).
2120     {
2121         UErrorCode status = U_ZERO_ERROR;
2122         RegexMatcher  m(".?", 0, status);
2123         REGEX_CHECK_STATUS;
2124         REGEX_ASSERT(m.find());
2125         REGEX_ASSERT(m.start(status) == 0);
2126         REGEX_ASSERT(m.input() == "");
2127     }
2128     {
2129         UErrorCode status = U_ZERO_ERROR;
2130         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2131         RegexMatcher  *m = p->matcher(status);  // NOTE(review): p is dereferenced before status is checked on the next line - confirm compile() cannot return NULL here
2132         REGEX_CHECK_STATUS;
2133
2134         REGEX_ASSERT(m->find() == FALSE);
2135         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2136         delete m;
2137         delete p;
2138     }
2139
2140     //
2141     // Regions
2142     //   The default region is the whole input, with anchoring bounds on and transparent bounds off; reset() restores the full region but keeps the bounds settings.
2143     {
2144         UErrorCode status = U_ZERO_ERROR;
2145         UText testPattern = UTEXT_INITIALIZER;
2146         UText testText    = UTEXT_INITIALIZER;
2147         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2148         REGEX_VERBOSE_TEXT(&testPattern);
2149         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2150         REGEX_VERBOSE_TEXT(&testText);
2151
2152         RegexMatcher m(&testPattern, &testText, 0, status);
2153         REGEX_CHECK_STATUS;
2154         REGEX_ASSERT(m.regionStart() == 0);
2155         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2156         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2157         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2158         // Restrict matching to the region 2..4; the pattern must then match exactly that range.
2159         m.region(2,4, status);
2160         REGEX_CHECK_STATUS;
2161         REGEX_ASSERT(m.matches(status));
2162         REGEX_ASSERT(m.start(status)==2);
2163         REGEX_ASSERT(m.end(status)==4);
2164         REGEX_CHECK_STATUS;
2165         // reset() with no arguments puts the region back to the whole input.
2166         m.reset();
2167         REGEX_ASSERT(m.regionStart() == 0);
2168         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2169         // Switching to a new input through reset(UText*) also yields a whole-input region.
2170         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2171         REGEX_VERBOSE_TEXT(&testText);
2172         m.reset(&testText);
2173         REGEX_ASSERT(m.regionStart() == 0);
2174         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2175         // The use...Bounds() setters and reset() return the matcher itself, and the chosen setting survives reset().
2176         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2177         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2178         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2179         REGEX_ASSERT(&m == &m.reset());
2180         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2181
2182         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2183         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2184         REGEX_ASSERT(&m == &m.reset());
2185         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2186
2187         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2188         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2189         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2190         REGEX_ASSERT(&m == &m.reset());
2191         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2192
2193         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2194         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2195         REGEX_ASSERT(&m == &m.reset());
2196         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2197
2198         utext_close(&testText);
2199         utext_close(&testPattern);
2200     }
2201
2202     //
2203     // hitEnd() and requireEnd()
2204     //   Three patterns are run with lookingAt() against "aabb"; the expected hitEnd()/requireEnd() values differ per pattern as asserted below.
2205     {
2206         UErrorCode status = U_ZERO_ERROR;
2207         UText testPattern = UTEXT_INITIALIZER;
2208         UText testText    = UTEXT_INITIALIZER;
2209         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2210         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2211         utext_openUTF8(&testPattern, str_, -1, &status);
2212         utext_openUTF8(&testText, str_aabb, -1, &status);
2213         // Pattern 1 (dot-star): expected to hit the end of input without requiring it.
2214         RegexMatcher m1(&testPattern, &testText,  0, status);
2215         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2216         REGEX_ASSERT(m1.hitEnd() == TRUE);
2217         REGEX_ASSERT(m1.requireEnd() == FALSE);
2218         REGEX_CHECK_STATUS;
2219         // Pattern 2 (a-star): matches only the leading a characters, so the end of input is expected not to be hit.
2220         status = U_ZERO_ERROR;
2221         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2222         utext_openUTF8(&testPattern, str_a, -1, &status);  // testPattern is opened again over the next pattern
2223         RegexMatcher m2(&testPattern, &testText, 0, status);
2224         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2225         REGEX_ASSERT(m2.hitEnd() == FALSE);
2226         REGEX_ASSERT(m2.requireEnd() == FALSE);
2227         REGEX_CHECK_STATUS;
2228         // Pattern 3 (dot-star followed by a dollar anchor): expected both to hit the end and to require it.
2229         status = U_ZERO_ERROR;
2230         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2231         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2232         RegexMatcher m3(&testPattern, &testText, 0, status);
2233         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2234         REGEX_ASSERT(m3.hitEnd() == TRUE);
2235         REGEX_ASSERT(m3.requireEnd() == TRUE);
2236         REGEX_CHECK_STATUS;
2237
2238         utext_close(&testText);
2239         utext_close(&testPattern);
2240     }
2241 }
2242
2243
2244 //---------------------------------------------------------------------------
2245 //
2246 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2247 //                         Replace family of functions.
2248 //
2249 //---------------------------------------------------------------------------
2250 void RegexTest::API_Replace_UTF8() {
2251     //
2252     //  Replace
2253     //
2254     int32_t             flags=0;
2255     UParseError         pe;
2256     UErrorCode          status=U_ZERO_ERROR;
2257
2258     UText               re=UTEXT_INITIALIZER;
2259     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2260     REGEX_VERBOSE_TEXT(&re);
2261     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2262     REGEX_CHECK_STATUS;
2263
2264     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2265     //             012345678901234567
2266     UText dataText = UTEXT_INITIALIZER;
2267     utext_openUTF8(&dataText, data, -1, &status);
2268     REGEX_CHECK_STATUS;
2269     REGEX_VERBOSE_TEXT(&dataText);
2270     RegexMatcher *matcher = pat->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2271
2272     //
2273     //  Plain vanilla matches.
2274     //
2275     UnicodeString  dest;
2276     UText destText = UTEXT_INITIALIZER;
2277     utext_openUnicodeString(&destText, &dest, &status);
2278     UText *result;
2279
2280     UText replText = UTEXT_INITIALIZER;
2281
2282     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2283     utext_openUTF8(&replText, str_yz, -1, &status);
2284     REGEX_VERBOSE_TEXT(&replText);
2285     result = matcher->replaceFirst(&replText, NULL, status);
2286     REGEX_CHECK_STATUS;
2287     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2288     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2289     utext_close(result);
2290     result = matcher->replaceFirst(&replText, &destText, status);
2291     REGEX_CHECK_STATUS;
2292     REGEX_ASSERT(result == &destText);
2293     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2294
2295     result = matcher->replaceAll(&replText, NULL, status);
2296     REGEX_CHECK_STATUS;
2297     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2298     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2299     utext_close(result);
2300
2301     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2302     result = matcher->replaceAll(&replText, &destText, status);
2303     REGEX_CHECK_STATUS;
2304     REGEX_ASSERT(result == &destText);
2305     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2306
2307     //
2308     //  Plain vanilla non-matches.
2309     //
2310     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2311     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2312     matcher->reset(&dataText);
2313
2314     result = matcher->replaceFirst(&replText, NULL, status);
2315     REGEX_CHECK_STATUS;
2316     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2317     utext_close(result);
2318     result = matcher->replaceFirst(&replText, &destText, status);
2319     REGEX_CHECK_STATUS;
2320     REGEX_ASSERT(result == &destText);
2321     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2322
2323     result = matcher->replaceAll(&replText, NULL, status);
2324     REGEX_CHECK_STATUS;
2325     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2326     utext_close(result);
2327     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2328     result = matcher->replaceAll(&replText, &destText, status);
2329     REGEX_CHECK_STATUS;
2330     REGEX_ASSERT(result == &destText);
2331     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2332
2333     //
2334     // Empty source string
2335     //
2336     utext_openUTF8(&dataText, NULL, 0, &status);
2337     matcher->reset(&dataText);
2338
2339     result = matcher->replaceFirst(&replText, NULL, status);
2340     REGEX_CHECK_STATUS;
2341     REGEX_ASSERT_UTEXT_UTF8("", result);
2342     utext_close(result);
2343     result = matcher->replaceFirst(&replText, &destText, status);
2344     REGEX_CHECK_STATUS;
2345     REGEX_ASSERT(result == &destText);
2346     REGEX_ASSERT_UTEXT_UTF8("", result);
2347
2348     result = matcher->replaceAll(&replText, NULL, status);
2349     REGEX_CHECK_STATUS;
2350     REGEX_ASSERT_UTEXT_UTF8("", result);
2351     utext_close(result);
2352     result = matcher->replaceAll(&replText, &destText, status);
2353     REGEX_CHECK_STATUS;
2354     REGEX_ASSERT(result == &destText);
2355     REGEX_ASSERT_UTEXT_UTF8("", result);
2356
2357     //
2358     // Empty substitution string
2359     //
2360     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2361     matcher->reset(&dataText);
2362
2363     utext_openUTF8(&replText, NULL, 0, &status);
2364     result = matcher->replaceFirst(&replText, NULL, status);
2365     REGEX_CHECK_STATUS;
2366     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2367     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2368     utext_close(result);
2369     result = matcher->replaceFirst(&replText, &destText, status);
2370     REGEX_CHECK_STATUS;
2371     REGEX_ASSERT(result == &destText);
2372     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2373
2374     result = matcher->replaceAll(&replText, NULL, status);
2375     REGEX_CHECK_STATUS;
2376     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2377     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2378     utext_close(result);
2379     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2380     result = matcher->replaceAll(&replText, &destText, status);
2381     REGEX_CHECK_STATUS;
2382     REGEX_ASSERT(result == &destText);
2383     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2384
2385     //
2386     // match whole string
2387     //
2388     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2389     utext_openUTF8(&dataText, str_abc, -1, &status);
2390     matcher->reset(&dataText);
2391
2392     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2393     utext_openUTF8(&replText, str_xyz, -1, &status);
2394     result = matcher->replaceFirst(&replText, NULL, status);
2395     REGEX_CHECK_STATUS;
2396     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2397     utext_close(result);
2398     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2399     result = matcher->replaceFirst(&replText, &destText, status);
2400     REGEX_CHECK_STATUS;
2401     REGEX_ASSERT(result == &destText);
2402     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2403
2404     result = matcher->replaceAll(&replText, NULL, status);
2405     REGEX_CHECK_STATUS;
2406     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2407     utext_close(result);
2408     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2409     result = matcher->replaceAll(&replText, &destText, status);
2410     REGEX_CHECK_STATUS;
2411     REGEX_ASSERT(result == &destText);
2412     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2413
2414     //
2415     // Capture Group, simple case
2416     //
2417     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2418     utext_openUTF8(&re, str_add, -1, &status);
2419     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2420     REGEX_CHECK_STATUS;
2421
2422     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2423     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2424     RegexMatcher *matcher2 = pat2->matcher(&dataText, RegexPattern::PATTERN_IS_UTEXT, status);
2425     REGEX_CHECK_STATUS;
2426
2427     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2428     utext_openUTF8(&replText, str_11, -1, &status);
2429     result = matcher2->replaceFirst(&replText, NULL, status);
2430     REGEX_CHECK_STATUS;
2431     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2432     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2433     utext_close(result);
2434     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2435     result = matcher2->replaceFirst(&replText, &destText, status);
2436     REGEX_CHECK_STATUS;
2437     REGEX_ASSERT(result == &destText);
2438     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2439
2440     regextst_openUTF8FromInvariant(&replText, "The value of \\$1 is $1.", -1, &status);
2441     result = matcher2->replaceFirst(&replText, NULL, status);
2442     REGEX_CHECK_STATUS;
2443     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2444     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2445     utext_close(result);
2446     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2447     result = matcher2->replaceFirst(&replText, &destText, status);
2448     REGEX_CHECK_STATUS;
2449     REGEX_ASSERT(result == &destText);
2450     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2451
2452     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2453     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2454     result = matcher2->replaceFirst(&replText, NULL, status);
2455     REGEX_CHECK_STATUS;
2456     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2457     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2458     utext_close(result);
2459     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2460     result = matcher2->replaceFirst(&replText, &destText, status);
2461     REGEX_CHECK_STATUS;
2462     REGEX_ASSERT(result == &destText);
2463     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2464
2465     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2466     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2467     //                                 012345678901234567890123456
2468     supplDigitChars[22] = 0xF0;
2469     supplDigitChars[23] = 0x9D;
2470     supplDigitChars[24] = 0x9F;
2471     supplDigitChars[25] = 0x8F;
2472     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2473
2474     result = matcher2->replaceFirst(&replText, NULL, status);
2475     REGEX_CHECK_STATUS;
2476     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2477     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2478     utext_close(result);
2479     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2480     result = matcher2->replaceFirst(&replText, &destText, status);
2481     REGEX_CHECK_STATUS;
2482     REGEX_ASSERT(result == &destText);
2483     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2484     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2485     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2486     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2487 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2488     utext_close(result);
2489     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2490     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2491     REGEX_ASSERT(result == &destText);
2492 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2493
2494     //
2495     // Replacement String with \u hex escapes
2496     //
2497     {
2498       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2499       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2500         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2501         utext_openUTF8(&replText, str_u0043, -1, &status);
2502         matcher->reset(&dataText);
2503
2504         result = matcher->replaceAll(&replText, NULL, status);
2505         REGEX_CHECK_STATUS;
2506         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2507         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2508         utext_close(result);
2509         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2510         result = matcher->replaceAll(&replText, &destText, status);
2511         REGEX_CHECK_STATUS;
2512         REGEX_ASSERT(result == &destText);
2513         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2514     }
2515     {
2516       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2517         utext_openUTF8(&dataText, str_abc, -1, &status);
2518         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2519         utext_openUTF8(&replText, str_U00010000, -1, &status);
2520         matcher->reset(&dataText);
2521
2522         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2523         //                          0123456789
2524         expected[2] = 0xF0;
2525         expected[3] = 0x90;
2526         expected[4] = 0x80;
2527         expected[5] = 0x80;
2528
2529         result = matcher->replaceAll(&replText, NULL, status);
2530         REGEX_CHECK_STATUS;
2531         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2532         utext_close(result);
2533         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2534         result = matcher->replaceAll(&replText, &destText, status);
2535         REGEX_CHECK_STATUS;
2536         REGEX_ASSERT(result == &destText);
2537         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2538     }
2539     // TODO:  need more through testing of capture substitutions.
2540
2541     // Bug 4057
2542     //
2543     {
2544         status = U_ZERO_ERROR;
2545 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2546 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2547 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2548         utext_openUTF8(&re, str_ssee, -1, &status);
2549         utext_openUTF8(&dataText, str_blah, -1, &status);
2550         utext_openUTF8(&replText, str_ooh, -1, &status);
2551
2552         RegexMatcher m(&re, 0, status);
2553         REGEX_CHECK_STATUS;
2554
2555         UnicodeString result;
2556         UText resultText = UTEXT_INITIALIZER;
2557         utext_openUnicodeString(&resultText, &result, &status);
2558
2559         // Multiple finds do NOT bump up the previous appendReplacement postion.
2560         m.reset(&dataText);
2561         m.find();
2562         m.find();
2563         m.appendReplacement(&resultText, &replText, status);
2564         REGEX_CHECK_STATUS;
2565         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2566         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2567
2568         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2569         status = U_ZERO_ERROR;
2570         result.truncate(0);
2571         utext_openUnicodeString(&resultText, &result, &status);
2572         m.reset(10, status);
2573         m.find();
2574         m.find();
2575         m.appendReplacement(&resultText, &replText, status);
2576         REGEX_CHECK_STATUS;
2577         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2578         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2579
2580         // find() at interior of string, appendReplacement still starts at beginning.
2581         status = U_ZERO_ERROR;
2582         result.truncate(0);
2583         utext_openUnicodeString(&resultText, &result, &status);
2584         m.reset();
2585         m.find(10, status);
2586         m.find();
2587         m.appendReplacement(&resultText, &replText, status);
2588         REGEX_CHECK_STATUS;
2589         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2590         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2591
2592         m.appendTail(&resultText, status);
2593         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2594         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2595
2596         utext_close(&resultText);
2597     }
2598
2599     delete matcher2;
2600     delete pat2;
2601     delete matcher;
2602     delete pat;
2603
2604     utext_close(&dataText);
2605     utext_close(&replText);
2606     utext_close(&destText);
2607     utext_close(&re);
2608 }
2609
2610
2611 //---------------------------------------------------------------------------
2612 //
2613 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2614 //                        present and nominally working.
2615 //
2616 //---------------------------------------------------------------------------
2617 void RegexTest::API_Pattern_UTF8() {
2618     RegexPattern        pata;    // Test default constructor to not crash.
2619     RegexPattern        patb;
2620
2621     REGEX_ASSERT(pata == patb);
2622     REGEX_ASSERT(pata == pata);
2623
2624     UText         re1 = UTEXT_INITIALIZER;
2625     UText         re2 = UTEXT_INITIALIZER;
2626     UErrorCode    status = U_ZERO_ERROR;
2627     UParseError   pe;
2628
2629     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2630     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2631     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2632     utext_openUTF8(&re2, str_def, -1, &status);
2633
2634     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2635     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2636     REGEX_CHECK_STATUS;
2637     REGEX_ASSERT(*pat1 == *pat1);
2638     REGEX_ASSERT(*pat1 != pata);
2639
2640     // Assign
2641     patb = *pat1;
2642     REGEX_ASSERT(patb == *pat1);
2643
2644     // Copy Construct
2645     RegexPattern patc(*pat1);
2646     REGEX_ASSERT(patc == *pat1);
2647     REGEX_ASSERT(patb == patc);
2648     REGEX_ASSERT(pat1 != pat2);
2649     patb = *pat2;
2650     REGEX_ASSERT(patb != patc);
2651     REGEX_ASSERT(patb == *pat2);
2652
2653     // Compile with no flags.
2654     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2655     REGEX_ASSERT(*pat1a == *pat1);
2656
2657     REGEX_ASSERT(pat1a->flags() == 0);
2658
2659     // Compile with different flags should be not equal
2660     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2661     REGEX_CHECK_STATUS;
2662
2663     REGEX_ASSERT(*pat1b != *pat1a);
2664     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2665     REGEX_ASSERT(pat1a->flags() == 0);
2666     delete pat1b;
2667
2668     // clone
2669     RegexPattern *pat1c = pat1->clone();
2670     REGEX_ASSERT(*pat1c == *pat1);
2671     REGEX_ASSERT(*pat1c != *pat2);
2672
2673     delete pat1c;
2674     delete pat1a;
2675     delete pat1;
2676     delete pat2;
2677
2678     utext_close(&re1);
2679     utext_close(&re2);
2680
2681
2682     //
2683     //   Verify that a matcher created from a cloned pattern works.
2684     //     (Jitterbug 3423)
2685     //
2686     {
2687         UErrorCode     status     = U_ZERO_ERROR;
2688         UText          pattern    = UTEXT_INITIALIZER;
2689         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2690         utext_openUTF8(&pattern, str_pL, -1, &status);
2691
2692         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2693         RegexPattern  *pClone     = pSource->clone();
2694         delete         pSource;
2695         RegexMatcher  *mFromClone = pClone->matcher(status);
2696         REGEX_CHECK_STATUS;
2697
2698         UText          input      = UTEXT_INITIALIZER;
2699         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2700         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2701         mFromClone->reset(&input);
2702         REGEX_ASSERT(mFromClone->find() == TRUE);
2703         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2704         REGEX_ASSERT(mFromClone->find() == TRUE);
2705         REGEX_ASSERT(mFromClone->group(status) == "World");
2706         REGEX_ASSERT(mFromClone->find() == FALSE);
2707         delete mFromClone;
2708         delete pClone;
2709
2710         utext_close(&input);
2711         utext_close(&pattern);
2712     }
2713
2714     //
2715     //   matches convenience API
2716     //
2717     {
2718         UErrorCode status  = U_ZERO_ERROR;
2719         UText      pattern = UTEXT_INITIALIZER;
2720         UText      input   = UTEXT_INITIALIZER;
2721
2722         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2723         utext_openUTF8(&input, str_randominput, -1, &status);
2724
2725         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2726         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2727         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2728         REGEX_CHECK_STATUS;
2729
2730         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2731         utext_openUTF8(&pattern, str_abc, -1, &status);
2732         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2733         REGEX_CHECK_STATUS;
2734
2735         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2736         utext_openUTF8(&pattern, str_nput, -1, &status);
2737         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2738         REGEX_CHECK_STATUS;
2739
2740         utext_openUTF8(&pattern, str_randominput, -1, &status);
2741         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2742         REGEX_CHECK_STATUS;
2743
2744         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2745         utext_openUTF8(&pattern, str_u, -1, &status);
2746         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2747         REGEX_CHECK_STATUS;
2748
2749         utext_openUTF8(&input, str_abc, -1, &status);
2750         utext_openUTF8(&pattern, str_abc, -1, &status);
2751         status = U_INDEX_OUTOFBOUNDS_ERROR;
2752         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2753         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2754
2755         utext_close(&input);
2756         utext_close(&pattern);
2757     }
2758
2759
2760     //
2761     // Split()
2762     //
2763     status = U_ZERO_ERROR;
2764     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2765     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2766     pat1 = RegexPattern::compile(&re1, pe, status);
2767     REGEX_CHECK_STATUS;
2768     UnicodeString  fields[10];
2769
2770     int32_t n;
2771     n = pat1->split("Now is the time", fields, 10, status);
2772     REGEX_CHECK_STATUS;
2773     REGEX_ASSERT(n==4);
2774     REGEX_ASSERT(fields[0]=="Now");
2775     REGEX_ASSERT(fields[1]=="is");
2776     REGEX_ASSERT(fields[2]=="the");
2777     REGEX_ASSERT(fields[3]=="time");
2778     REGEX_ASSERT(fields[4]=="");
2779
2780     n = pat1->split("Now is the time", fields, 2, status);
2781     REGEX_CHECK_STATUS;
2782     REGEX_ASSERT(n==2);
2783     REGEX_ASSERT(fields[0]=="Now");
2784     REGEX_ASSERT(fields[1]=="is the time");
2785     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2786
2787     fields[1] = "*";
2788     status = U_ZERO_ERROR;
2789     n = pat1->split("Now is the time", fields, 1, status);
2790     REGEX_CHECK_STATUS;
2791     REGEX_ASSERT(n==1);
2792     REGEX_ASSERT(fields[0]=="Now is the time");
2793     REGEX_ASSERT(fields[1]=="*");
2794     status = U_ZERO_ERROR;
2795
2796     n = pat1->split("    Now       is the time   ", fields, 10, status);
2797     REGEX_CHECK_STATUS;
2798     REGEX_ASSERT(n==5);
2799     REGEX_ASSERT(fields[0]=="");
2800     REGEX_ASSERT(fields[1]=="Now");
2801     REGEX_ASSERT(fields[2]=="is");
2802     REGEX_ASSERT(fields[3]=="the");
2803     REGEX_ASSERT(fields[4]=="time");
2804     REGEX_ASSERT(fields[5]=="");
2805
2806     n = pat1->split("     ", fields, 10, status);
2807     REGEX_CHECK_STATUS;
2808     REGEX_ASSERT(n==1);
2809     REGEX_ASSERT(fields[0]=="");
2810
2811     fields[0] = "foo";
2812     n = pat1->split("", fields, 10, status);
2813     REGEX_CHECK_STATUS;
2814     REGEX_ASSERT(n==0);
2815     REGEX_ASSERT(fields[0]=="foo");
2816
2817     delete pat1;
2818
2819     //  split, with a pattern with (capture)
2820     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2821     pat1 = RegexPattern::compile(&re1,  pe, status);
2822     REGEX_CHECK_STATUS;
2823
2824     status = U_ZERO_ERROR;
2825     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2826     REGEX_CHECK_STATUS;
2827     REGEX_ASSERT(n==6);
2828     REGEX_ASSERT(fields[0]=="");
2829     REGEX_ASSERT(fields[1]=="a");
2830     REGEX_ASSERT(fields[2]=="Now is ");
2831     REGEX_ASSERT(fields[3]=="b");
2832     REGEX_ASSERT(fields[4]=="the time");
2833     REGEX_ASSERT(fields[5]=="c");
2834     REGEX_ASSERT(fields[6]=="");
2835     REGEX_ASSERT(status==U_ZERO_ERROR);
2836
2837     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2838     REGEX_CHECK_STATUS;
2839     REGEX_ASSERT(n==6);
2840     REGEX_ASSERT(fields[0]=="  ");
2841     REGEX_ASSERT(fields[1]=="a");
2842     REGEX_ASSERT(fields[2]=="Now is ");
2843     REGEX_ASSERT(fields[3]=="b");
2844     REGEX_ASSERT(fields[4]=="the time");
2845     REGEX_ASSERT(fields[5]=="c");
2846     REGEX_ASSERT(fields[6]=="");
2847
2848     status = U_ZERO_ERROR;
2849     fields[6] = "foo";
2850     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
2851     REGEX_CHECK_STATUS;
2852     REGEX_ASSERT(n==6);
2853     REGEX_ASSERT(fields[0]=="  ");
2854     REGEX_ASSERT(fields[1]=="a");
2855     REGEX_ASSERT(fields[2]=="Now is ");
2856     REGEX_ASSERT(fields[3]=="b");
2857     REGEX_ASSERT(fields[4]=="the time");
2858     REGEX_ASSERT(fields[5]=="c");
2859     REGEX_ASSERT(fields[6]=="foo");
2860
2861     status = U_ZERO_ERROR;
2862     fields[5] = "foo";
2863     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
2864     REGEX_CHECK_STATUS;
2865     REGEX_ASSERT(n==5);
2866     REGEX_ASSERT(fields[0]=="  ");
2867     REGEX_ASSERT(fields[1]=="a");
2868     REGEX_ASSERT(fields[2]=="Now is ");
2869     REGEX_ASSERT(fields[3]=="b");
2870     REGEX_ASSERT(fields[4]=="the time<c>");
2871     REGEX_ASSERT(fields[5]=="foo");
2872
2873     status = U_ZERO_ERROR;
2874     fields[5] = "foo";
2875     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
2876     REGEX_CHECK_STATUS;
2877     REGEX_ASSERT(n==5);
2878     REGEX_ASSERT(fields[0]=="  ");
2879     REGEX_ASSERT(fields[1]=="a");
2880     REGEX_ASSERT(fields[2]=="Now is ");
2881     REGEX_ASSERT(fields[3]=="b");
2882     REGEX_ASSERT(fields[4]=="the time");
2883     REGEX_ASSERT(fields[5]=="foo");
2884
2885     status = U_ZERO_ERROR;
2886     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
2887     REGEX_CHECK_STATUS;
2888     REGEX_ASSERT(n==4);
2889     REGEX_ASSERT(fields[0]=="  ");
2890     REGEX_ASSERT(fields[1]=="a");
2891     REGEX_ASSERT(fields[2]=="Now is ");
2892     REGEX_ASSERT(fields[3]=="the time<c>");
2893     status = U_ZERO_ERROR;
2894     delete pat1;
2895
2896     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
2897     pat1 = RegexPattern::compile(&re1, pe, status);
2898     REGEX_CHECK_STATUS;
2899     n = pat1->split("1-10,20", fields, 10, status);
2900     REGEX_CHECK_STATUS;
2901     REGEX_ASSERT(n==5);
2902     REGEX_ASSERT(fields[0]=="1");
2903     REGEX_ASSERT(fields[1]=="-");
2904     REGEX_ASSERT(fields[2]=="10");
2905     REGEX_ASSERT(fields[3]==",");
2906     REGEX_ASSERT(fields[4]=="20");
2907     delete pat1;
2908
2909
2910     //
2911     // RegexPattern::pattern() and patternText()
2912     //
2913     pat1 = new RegexPattern();
2914     REGEX_ASSERT(pat1->pattern() == "");
2915     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
2916     delete pat1;
2917
2918     regextst_openUTF8FromInvariant(&re1, "(Hello, world)*", -1, &status);
2919     pat1 = RegexPattern::compile(&re1, pe, status);
2920     REGEX_CHECK_STATUS;
2921     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
2922     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
2923     delete pat1;
2924
2925     utext_close(&re1);
2926 }
2927
2928
2929 //---------------------------------------------------------------------------
2930 //
2931 //      Extended       A more thorough check for features of regex patterns
2932 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
2934 //                     A description of the test data format is included in that file.
2935 //
2936 //---------------------------------------------------------------------------
2937
2938 const char *
2939 RegexTest::getPath(char buffer[2048], const char *filename) {
2940     UErrorCode status=U_ZERO_ERROR;
2941     const char *testDataDirectory = IntlTest::getSourceTestData(status);
2942     if (U_FAILURE(status)) {
2943         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
2944         return NULL;
2945     }
2946
2947     strcpy(buffer, testDataDirectory);
2948     strcat(buffer, filename);
2949     return buffer;
2950 }
2951
2952 void RegexTest::Extended() {
2953     char tdd[2048];
2954     const char *srcPath;
2955     UErrorCode  status  = U_ZERO_ERROR;
2956     int32_t     lineNum = 0;
2957
2958     //
2959     //  Open and read the test data file.
2960     //
2961     srcPath=getPath(tdd, "regextst.txt");
2962     if(srcPath==NULL) {
2963         return; /* something went wrong, error already output */
2964     }
2965
2966     int32_t    len;
2967     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
2968     if (U_FAILURE(status)) {
2969         return; /* something went wrong, error already output */
2970     }
2971
2972     //
2973     //  Put the test data into a UnicodeString
2974     //
2975     UnicodeString testString(FALSE, testData, len);
2976
2977     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
2978     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
2979     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
2980
2981     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
2982     UnicodeString   testPattern;   // The pattern for test from the test file.
2983     UnicodeString   testFlags;     // the flags   for a test.
2984     UnicodeString   matchString;   // The marked up string to be used as input
2985
2986     if (U_FAILURE(status)){
2987         dataerrln("Construct RegexMatcher() error.");
2988         delete [] testData;
2989         return;
2990     }
2991
2992     //
2993     //  Loop over the test data file, once per line.
2994     //
2995     while (lineMat.find()) {
2996         lineNum++;
2997         if (U_FAILURE(status)) {
2998           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
2999         }
3000
3001         status = U_ZERO_ERROR;
3002         UnicodeString testLine = lineMat.group(1, status);
3003         if (testLine.length() == 0) {
3004             continue;
3005         }
3006
3007         //
3008         // Parse the test line.  Skip blank and comment only lines.
3009         // Separate out the three main fields - pattern, flags, target.
3010         //
3011
3012         commentMat.reset(testLine);
3013         if (commentMat.lookingAt(status)) {
3014             // This line is a comment, or blank.
3015             continue;
3016         }
3017
3018         //
3019         //  Pull out the pattern field, remove it from the test file line.
3020         //
3021         quotedStuffMat.reset(testLine);
3022         if (quotedStuffMat.lookingAt(status)) {
3023             testPattern = quotedStuffMat.group(2, status);
3024             testLine.remove(0, quotedStuffMat.end(0, status));
3025         } else {
3026             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3027             continue;
3028         }
3029
3030
3031         //
3032         //  Pull out the flags from the test file line.
3033         //
3034         flagsMat.reset(testLine);
3035         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3036         testFlags = flagsMat.group(1, status);
3037         if (flagsMat.group(2, status).length() > 0) {
3038             errln("Bad Match flag at line %d. Scanning %c\n",
3039                 lineNum, flagsMat.group(2, status).charAt(0));
3040             continue;
3041         }
3042         testLine.remove(0, flagsMat.end(0, status));
3043
3044         //
3045         //  Pull out the match string, as a whole.
3046         //    We'll process the <tags> later.
3047         //
3048         quotedStuffMat.reset(testLine);
3049         if (quotedStuffMat.lookingAt(status)) {
3050             matchString = quotedStuffMat.group(2, status);
3051             testLine.remove(0, quotedStuffMat.end(0, status));
3052         } else {
3053             errln("Bad match string at test file line %d", lineNum);
3054             continue;
3055         }
3056
3057         //
3058         //  The only thing left from the input line should be an optional trailing comment.
3059         //
3060         commentMat.reset(testLine);
3061         if (commentMat.lookingAt(status) == FALSE) {
3062             errln("Line %d: unexpected characters at end of test line.", lineNum);
3063             continue;
3064         }
3065
3066         //
3067         //  Run the test
3068         //
3069         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3070     }
3071
3072     delete [] testData;
3073
3074 }
3075
3076
3077
3078 //---------------------------------------------------------------------------
3079 //
3080 //    regex_find(pattern, flags, inputString, lineNumber)
3081 //
3082 //         Function to run a single test from the Extended (data driven) tests.
3083 //         See file test/testdata/regextst.txt for a description of the
3084 //         pattern and inputString fields, and the allowed flags.
3085 //         lineNumber is the source line in regextst.txt of the test.
3086 //
3087 //---------------------------------------------------------------------------
3088
3089
3090 //  Set a value into a UVector at position specified by a decimal number in
3091 //   a UnicodeString.   This is a utility function needed by the actual test function,
3092 //   which follows.
3093 static void set(UVector &vec, int32_t val, UnicodeString index) {
3094     UErrorCode  status=U_ZERO_ERROR;
3095     int32_t  idx = 0;
3096     for (int32_t i=0; i<index.length(); i++) {
3097         int32_t d=u_charDigitValue(index.charAt(i));
3098         if (d<0) {return;}
3099         idx = idx*10 + d;
3100     }
3101     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3102     vec.setElementAt(val, idx);
3103 }
3104
3105 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3106     UErrorCode  status=U_ZERO_ERROR;
3107     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3108     vec.setElementAt(val, idx);
3109 }
3110
3111 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3112 {
3113     UBool couldFind = TRUE;
3114     UTEXT_SETNATIVEINDEX(utext, 0);
3115     int32_t i = 0;
3116     while (i < unistrOffset) {
3117         UChar32 c = UTEXT_NEXT32(utext);
3118         if (c != U_SENTINEL) {
3119             i += U16_LENGTH(c);
3120         } else {
3121             couldFind = FALSE;
3122             break;
3123         }
3124     }
3125     nativeIndex = UTEXT_GETNATIVEINDEX(utext);
3126     return couldFind;
3127 }
3128
3129
//
//  Run one data-driven test case.
//
//    pattern       the regular expression to compile.
//    flags         single-character options.  As read by the code below:
//                    i x s m e D   pattern compile options
//                    v             (only when UCONFIG_NO_BREAK_ITERATION==1) a compile failure
//                                  with U_UNSUPPORTED_ERROR is accepted silently
//                    E             a pattern compilation error is expected
//                    d             dump the compiled pattern / log the expected compile error
//                    2..9          number of times the match function is called
//                    M, L          use matches() / lookingAt() instead of find()
//                    t             turn on matcher tracing
//                    a, b          non-anchoring / transparent region bounds
//                    G             check only match vs. no match, not the capture groups
//                    Y, y          requireEnd() must be FALSE / TRUE
//                    Z, z          hitEnd() must be FALSE / TRUE
//    inputString   the text to match, still escaped and still containing the <n>, </n>,
//                  <r> and </r> tags that mark the expected group and region boundaries.
//    srcPath       path of the test data file; used in messages only.
//    line          line number of the test within the test data file; used in messages only.
//
//  Every check is made twice when possible: once with a matcher over the UnicodeString
//  input, and once with a matcher over a UTF-8 UText copy of the same pattern and input.
//
void RegexTest::regex_find(const UnicodeString &pattern,
                           const UnicodeString &flags,
                           const UnicodeString &inputString,
                           const char *srcPath,
                           int32_t line) {
    UnicodeString       unEscapedInput;     // inputString after unescape(), tags still present
    UnicodeString       deTaggedInput;      // unEscapedInput with the <...> tags removed

    int32_t             patternUTF8Length,      inputUTF8Length;
    char                *patternChars  = NULL, *inputChars = NULL;
    UText               patternText    = UTEXT_INITIALIZER;
    UText               inputText      = UTEXT_INITIALIZER;
    UConverter          *UTF8Converter = NULL;

    UErrorCode          status         = U_ZERO_ERROR;
    UParseError         pe;
    RegexPattern        *parsePat      = NULL;   // pattern used to locate the tags in the input
    RegexMatcher        *parseMatcher  = NULL;
    RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
    RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
    UVector             groupStarts(status);     // expected group boundaries, UTF-16 indexes
    UVector             groupEnds(status);
    UVector             groupStartsUTF8(status); // the same boundaries as native UText indexes
    UVector             groupEndsUTF8(status);
    UBool               isMatch        = FALSE, isUTF8Match = FALSE;
    UBool               failed         = FALSE;
    int32_t             numFinds;
    int32_t             i;
    UBool               useMatchesFunc   = FALSE;
    UBool               useLookingAtFunc = FALSE;
    int32_t             regionStart      = -1;   // -1 means no <r> / </r> tag was seen
    int32_t             regionEnd        = -1;
    int32_t             regionStartUTF8  = -1;
    int32_t             regionEndUTF8    = -1;


    //
    //  Compile the caller's pattern
    //
    uint32_t bflags = 0;
    if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
        bflags |= UREGEX_CASE_INSENSITIVE;
    }
    if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
        bflags |= UREGEX_COMMENTS;
    }
    if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
        bflags |= UREGEX_DOTALL;
    }
    if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
        bflags |= UREGEX_MULTILINE;
    }

    if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
        bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
    }
    if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
        bflags |= UREGEX_UNIX_LINES;
    }


    callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
    if (status != U_ZERO_ERROR) {
        #if UCONFIG_NO_BREAK_ITERATION==1
        // 'v' test flag means that the test pattern should not compile if ICU was configured
        //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
        if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
            goto cleanupAndReturn;
        }
        #endif
        if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
            // Expected pattern compilation error.
            if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
                logln("Pattern Compile returns \"%s\"", u_errorName(status));
            }
            goto cleanupAndReturn;
        } else {
            // Unexpected pattern compilation error.
            errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
            goto cleanupAndReturn;
        }
    }

    //
    //  Build a UTF-8 copy of the pattern, wrap it in a UText, and compile it a second time.
    //
    UTF8Converter = ucnv_open("UTF8", &status);
    ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);

    // Preflight (NULL buffer, zero capacity) to get the UTF-8 length, then extract for real.
    patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
    // The preflight is expected to report a buffer overflow; clear it.
    // NOTE(review): this reset is unconditional, so it also discards any failure left in
    //               status by ucnv_open() / ucnv_setFromUCallBack() above.
    status = U_ZERO_ERROR; // buffer overflow
    patternChars = new char[patternUTF8Length+1];
    pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
    utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);

    if (status == U_ZERO_ERROR) {
        UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);

        // Same handling of compile errors as for the UnicodeString pattern above.
        if (status != U_ZERO_ERROR) {
#if UCONFIG_NO_BREAK_ITERATION==1
            // 'v' test flag means that the test pattern should not compile if ICU was configured
            //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
            if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
                goto cleanupAndReturn;
            }
#endif
            if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
                // Expected pattern compilation error.
                if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
                    logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
                }
                goto cleanupAndReturn;
            } else {
                // Unexpected pattern compilation error.
                errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
                goto cleanupAndReturn;
            }
        }
    }

    if (UTF8Pattern == NULL) {
        // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
        // The UTF-16 half of the test still runs; every UTF-8 check below is guarded by a NULL test.
        logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
        status = U_ZERO_ERROR;
    }

    if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
        RegexPatternDump(callerPattern);
    }

    // Getting this far means that both compiles succeeded (or the UTF-8 one was skipped).
    if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
        errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
        goto cleanupAndReturn;
    }


    //
    // Number of times find() should be called on the test string, default to 1
    //   A single digit flag in the range 2..9 overrides the default.
    //
    numFinds = 1;
    for (i=2; i<=9; i++) {
        if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
            if (numFinds != 1) {
                errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
                goto cleanupAndReturn;
            }
            numFinds = i;
        }
    }

    // 'M' flag.  Use matches() instead of find()
    if (flags.indexOf((UChar)0x4d) >= 0) {
        useMatchesFunc = TRUE;
    }
    // 'L' flag.  Use lookingAt() instead of find()
    if (flags.indexOf((UChar)0x4c) >= 0) {
        useLookingAtFunc = TRUE;
    }

    //
    //  Find the tags in the input data, remove them, and record the group boundary
    //    positions.
    //
    //  NOTE(review): REGEX_CHECK_STATUS, REGEX_CHECK_STATUS_L and REGEX_ASSERT_L are macros
    //                defined elsewhere in this file.  If they return on failure, that return
    //                bypasses the cleanup at cleanupAndReturn - confirm.
    //
    parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
    REGEX_CHECK_STATUS_L(line);

    unEscapedInput = inputString.unescape();
    parseMatcher = parsePat->matcher(unEscapedInput, status);
    REGEX_CHECK_STATUS_L(line);
    while(parseMatcher->find()) {
        // Copy the text preceding the tag to deTaggedInput, dropping the tag itself.
        //   deTaggedInput.length() is then the position of the tag in the de-tagged text.
        parseMatcher->appendReplacement(deTaggedInput, "", status);
        // NOTE(review): the other status checks in this function use REGEX_CHECK_STATUS_L(line);
        //               confirm that the plain form is intended here.
        REGEX_CHECK_STATUS;
        UnicodeString groupNum = parseMatcher->group(2, status);
        if (groupNum == "r") {
            // <r> or </r>, a region specification within the string
            if (parseMatcher->group(1, status) == "/") {
                regionEnd = deTaggedInput.length();
            } else {
                regionStart = deTaggedInput.length();
            }
        } else {
            // <digits> or </digits>, a group match boundary tag.
            if (parseMatcher->group(1, status) == "/") {
                set(groupEnds, deTaggedInput.length(), groupNum);
            } else {
                set(groupStarts, deTaggedInput.length(), groupNum);
            }
        }
    }
    parseMatcher->appendTail(deTaggedInput);
    REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
    // If either region tag was seen, a start tag is required and must not follow the end tag.
    if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
      errln("mismatched <r> tags");
      failed = TRUE;
      goto cleanupAndReturn;
    }

    //
    //  Configure the matcher according to the flags specified with this test.
    //
    matcher = callerPattern->matcher(deTaggedInput, status);
    REGEX_CHECK_STATUS_L(line);
    if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
        // Tracing is enabled on the UnicodeString matcher only.
        matcher->setTrace(TRUE);
    }

    //
    //  Build a UTF-8 copy of the de-tagged input and a matcher over it.
    //
    if (UTF8Pattern != NULL) {
        // Preflight for the UTF-8 length, then extract for real (same scheme as for the pattern).
        inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
        status = U_ZERO_ERROR; // buffer overflow
        inputChars = new char[inputUTF8Length+1];
        deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
        utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);

        if (status == U_ZERO_ERROR) {
            UTF8Matcher = UTF8Pattern->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
            REGEX_CHECK_STATUS_L(line);
        }

        if (UTF8Matcher == NULL) {
            // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
          logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
            status = U_ZERO_ERROR;
        }
    }

    //
    //  Generate native indices for UTF8 versions of region and capture group info
    //
    if (UTF8Matcher != NULL) {
        // The return value is deliberately ignored for the region boundaries.
        if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
        if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);

        //  Fill out the native index UVector info.
        //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
        for (i=0; i<groupStarts.size(); i++) {
            int32_t  start = groupStarts.elementAti(i);
            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
            if (start >= 0) {
                int32_t  startUTF8;
                if (!utextOffsetToNative(&inputText, start, startUTF8)) {
                    errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
                    failed = TRUE;
                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
                }
                setInt(groupStartsUTF8, startUTF8, i);
            }

            int32_t  end = groupEnds.elementAti(i);
            //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
            if (end >= 0) {
                int32_t  endUTF8;
                if (!utextOffsetToNative(&inputText, end, endUTF8)) {
                    errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
                    failed = TRUE;
                    goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
                }
                setInt(groupEndsUTF8, endUTF8, i);
            }
        }
    }

    // Apply the region from the <r> ... </r> tags, if there was one, to both matchers.
    if (regionStart>=0) {
       matcher->region(regionStart, regionEnd, status);
       REGEX_CHECK_STATUS_L(line);
       if (UTF8Matcher != NULL) {
           UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
           REGEX_CHECK_STATUS_L(line);
       }
    }
    if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
        matcher->useAnchoringBounds(FALSE);
        if (UTF8Matcher != NULL) {
            UTF8Matcher->useAnchoringBounds(FALSE);
        }
    }
    if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
        matcher->useTransparentBounds(TRUE);
        if (UTF8Matcher != NULL) {
            UTF8Matcher->useTransparentBounds(TRUE);
        }
    }



    //
    // Do a find on the de-tagged input using the caller's pattern
    //     The selected match function is called numFinds times; only the result of the
    //     last call is kept in isMatch / isUTF8Match.
    //     TODO: error on count>1 and not find().
    //           error on both matches() and lookingAt().
    //
    for (i=0; i<numFinds; i++) {
        if (useMatchesFunc) {
            isMatch = matcher->matches(status);
            if (UTF8Matcher != NULL) {
               isUTF8Match = UTF8Matcher->matches(status);
            }
        } else  if (useLookingAtFunc) {
            isMatch = matcher->lookingAt(status);
            if (UTF8Matcher != NULL) {
                isUTF8Match = UTF8Matcher->lookingAt(status);
            }
        } else {
            isMatch = matcher->find();
            if (UTF8Matcher != NULL) {
                isUTF8Match = UTF8Matcher->find();
            }
        }
    }
    matcher->setTrace(FALSE);

    //
    // Match up the groups from the find() with the groups from the tags
    //

    // A match is expected exactly when the input contained at least one group tag,
    //   that is, when groupStarts is not empty.
    if (isMatch == FALSE && groupStarts.size() != 0) {
        errln("Error at line %d:  Match expected, but none found.", line);
        failed = TRUE;
        goto cleanupAndReturn;
    } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
        errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
        failed = TRUE;
        goto cleanupAndReturn;
    }

    //   G option in test means that capture group data is not available in the
    //     expected results, so the check needs to be suppressed.
    if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
        // Only check for match / no match.  Don't check capture groups.
        if (isMatch && groupStarts.size() == 0) {
            errln("Error at line %d:  No match expected, but one found.", line);
            failed = TRUE;
        } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
            errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
            failed = TRUE;
        }
        goto cleanupAndReturn;
    }

    REGEX_CHECK_STATUS_L(line);
    // Compare every group of the pattern, including group 0 (the entire match), with the
    //   tagged positions.  A group with no tag in the input has an expected position of -1.
    for (i=0; i<=matcher->groupCount(); i++) {
        int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
        int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
        if (matcher->start(i, status) != expectedStart) {
            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
                line, i, expectedStart, matcher->start(i, status));
            failed = TRUE;
            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
        } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
            errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
                  line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
            failed = TRUE;
            goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
        }

        int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
        int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
        if (matcher->end(i, status) != expectedEnd) {
            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
                line, i, expectedEnd, matcher->end(i, status));
            failed = TRUE;
            // Error on end position;  keep going; real error is probably yet to come as group
            //   end positions work from end of the input data towards the front.
        } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
            errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
                  line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
            failed = TRUE;
            // Error on end position;  keep going; real error is probably yet to come as group
            //   end positions work from end of the input data towards the front.
        }
    }
    // The input must not carry tags for more groups than the pattern has.
    //   matcher->groupCount does not include group 0, the entire match, hence the +1.
    if ( matcher->groupCount()+1 < groupStarts.size()) {
        errln("Error at line %d: Expected %d capture groups, found %d.",
            line, groupStarts.size()-1, matcher->groupCount());
        failed = TRUE;
        }
    else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
        errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
              line, groupStarts.size()-1, UTF8Matcher->groupCount());
        failed = TRUE;
    }

    if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
        matcher->requireEnd() == TRUE) {
        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
        failed = TRUE;
    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
        UTF8Matcher->requireEnd() == TRUE) {
        errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
        failed = TRUE;
    }

    if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
        matcher->requireEnd() == FALSE) {
        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
        failed = TRUE;
    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
        UTF8Matcher->requireEnd() == FALSE) {
        errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
        failed = TRUE;
    }

    if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
        matcher->hitEnd() == TRUE) {
        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
        failed = TRUE;
    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
               UTF8Matcher->hitEnd() == TRUE) {
        errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
        failed = TRUE;
    }

    if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
        matcher->hitEnd() == FALSE) {
        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
        failed = TRUE;
    } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
               UTF8Matcher->hitEnd() == FALSE) {
        errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
        failed = TRUE;
    }


    // Common exit point for the goto paths above: echo the failing test, then release
    //   everything this function allocated.  Pointers that were never set are still NULL.
cleanupAndReturn:
    if (failed) {
        infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
            +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
        // callerPattern->dump();
    }
    delete parseMatcher;
    delete parsePat;
    delete UTF8Matcher;
    delete UTF8Pattern;
    delete matcher;
    delete callerPattern;

    // Each UText is closed before the character buffer it was opened on is freed.
    utext_close(&inputText);
    delete[] inputChars;
    utext_close(&patternText);
    delete[] patternChars;
    ucnv_close(UTF8Converter);
}
3568
3569
3570
3571
3572 //---------------------------------------------------------------------------
3573 //
3574 //      Errors     Check for error handling in patterns.
3575 //
3576 //---------------------------------------------------------------------------
//  Each REGEX_ERR line below names a pattern that must fail to compile.
//    NOTE(review): REGEX_ERR is defined elsewhere in this file; its arguments appear to be
//                  (pattern, expected error line, expected error column, expected status) -
//                  confirm against the macro definition.
void RegexTest::Errors() {
    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    // Unclosed open parens
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags
    //   Compiling with this flag combination must report U_REGEX_UNIMPLEMENTED.
    //   NOTE(review): presumably UREGEX_CANON_EQ is the unimplemented option - confirm.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        delete pat1;
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);     // pattern spans two lines
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    //    For ICU 3.8 and earlier
    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
    //
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
3634
3635
3636 //-------------------------------------------------------------------------------
3637 //
3638 //  Read a text data file, convert it to UChars, and return the data
3639 //    in one big UChar * buffer, which the caller must delete.
3640 //
3641 //--------------------------------------------------------------------------------
3642 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3643                                      const char *defEncoding, UErrorCode &status) {
3644     UChar       *retPtr  = NULL;
3645     char        *fileBuf = NULL;
3646     UConverter* conv     = NULL;
3647     FILE        *f       = NULL;
3648
3649     ulen = 0;
3650     if (U_FAILURE(status)) {
3651         return retPtr;
3652     }
3653
3654     //
3655     //  Open the file.
3656     //
3657     f = fopen(fileName, "rb");
3658     if (f == 0) {
3659         dataerrln("Error opening test data file %s\n", fileName);
3660         status = U_FILE_ACCESS_ERROR;
3661         return NULL;
3662     }
3663     //
3664     //  Read it in
3665     //
3666     int32_t            fileSize;
3667     int32_t            amt_read;
3668
3669     fseek( f, 0, SEEK_END);
3670     fileSize = ftell(f);
3671     fileBuf = new char[fileSize];
3672     fseek(f, 0, SEEK_SET);
3673     amt_read = fread(fileBuf, 1, fileSize, f);
3674     if (amt_read != fileSize || fileSize <= 0) {
3675         errln("Error reading test data file.");
3676         goto cleanUpAndReturn;
3677     }
3678
3679     //
3680     // Look for a Unicode Signature (BOM) on the data just read
3681     //
3682     int32_t        signatureLength;
3683     const char *   fileBufC;
3684     const char*    encoding;
3685
3686     fileBufC = fileBuf;
3687     encoding = ucnv_detectUnicodeSignature(
3688         fileBuf, fileSize, &signatureLength, &status);
3689     if(encoding!=NULL ){
3690         fileBufC  += signatureLength;
3691         fileSize  -= signatureLength;
3692     } else {
3693         encoding = defEncoding;
3694         if (strcmp(encoding, "utf-8") == 0) {
3695             errln("file %s is missing its BOM", fileName);
3696         }
3697     }
3698
3699     //
3700     // Open a converter to take the rule file to UTF-16
3701     //
3702     conv = ucnv_open(encoding, &status);
3703     if (U_FAILURE(status)) {
3704         goto cleanUpAndReturn;
3705     }
3706
3707     //
3708     // Convert the rules to UChar.
3709     //  Preflight first to determine required buffer size.
3710     //
3711     ulen = ucnv_toUChars(conv,
3712         NULL,           //  dest,
3713         0,              //  destCapacity,
3714         fileBufC,
3715         fileSize,
3716         &status);
3717     if (status == U_BUFFER_OVERFLOW_ERROR) {
3718         // Buffer Overflow is expected from the preflight operation.
3719         status = U_ZERO_ERROR;
3720
3721         retPtr = new UChar[ulen+1];
3722         ucnv_toUChars(conv,
3723             retPtr,       //  dest,
3724             ulen+1,
3725             fileBufC,
3726             fileSize,
3727             &status);
3728     }
3729
3730 cleanUpAndReturn:
3731     fclose(f);
3732     delete[] fileBuf;
3733     ucnv_close(conv);
3734     if (U_FAILURE(status)) {
3735         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3736         delete retPtr;
3737         retPtr = 0;
3738         ulen   = 0;
3739     };
3740     return retPtr;
3741 }
3742
3743
3744 //-------------------------------------------------------------------------------
3745 //
3746 //   PerlTests  - Run Perl's regular expression tests
3747 //                The input file for this test is re_tests, the standard regular
3748 //                expression test data distributed with the Perl source code.
3749 //
3750 //                Here is Perl's description of the test data file:
3751 //
3752 //        # The tests are in a separate file 't/op/re_tests'.
3753 //        # Each line in that file is a separate test.
3754 //        # There are five columns, separated by tabs.
3755 //        #
3756 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3757 //        # Modifiers can be put after the closing C<'>.
3758 //        #
3759 //        # Column 2 contains the string to be matched.
3760 //        #
3761 //        # Column 3 contains the expected result:
3762 //        #     y   expect a match
3763 //        #     n   expect no match
3764 //        #     c   expect an error
3765 //        # B   test exposes a known bug in Perl, should be skipped
3766 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3767 //        #
3768 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3769 //        #
3770 //        # Column 4 contains a string, usually C<$&>.
3771 //        #
3772 //        # Column 5 contains the expected result of double-quote
3773 //        # interpolating that string after the match, or start of error message.
3774 //        #
3775 //        # Column 6, if present, contains a reason why the test is skipped.
3776 //        # This is printed with "skipped", for harness to pick up.
3777 //        #
3778 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3779 //        #
3780 //        # If you want to add a regular expression test that can't be expressed
3781 //        # in this format, don't add it here: put it in op/pat.t instead.
3782 //
3783 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
3785 //        (The i is in addition to whatever was there before.)
3786 //
3787 //-------------------------------------------------------------------------------
3788 void RegexTest::PerlTests() {
3789     char tdd[2048];
3790     const char *srcPath;
3791     UErrorCode  status = U_ZERO_ERROR;
3792     UParseError pe;
3793
3794     //
3795     //  Open and read the test data file.
3796     //
3797     srcPath=getPath(tdd, "re_tests.txt");
3798     if(srcPath==NULL) {
3799         return; /* something went wrong, error already output */
3800     }
3801
3802     int32_t    len;
3803     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3804     if (U_FAILURE(status)) {
3805         return; /* something went wrong, error already output */
3806     }
3807
3808     //
3809     //  Put the test data into a UnicodeString
3810     //
3811     UnicodeString testDataString(FALSE, testData, len);
3812
3813     //
3814     //  Regex to break the input file into lines, and strip the new lines.
3815     //     One line per match, capture group one is the desired data.
3816     //
3817     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3818     if (U_FAILURE(status)) {
3819         dataerrln("RegexPattern::compile() error");
3820         return;
3821     }
3822     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3823
3824     //
3825     //  Regex to split a test file line into fields.
3826     //    There are six fields, separated by tabs.
3827     //
3828     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3829
3830     //
3831     //  Regex to identify test patterns with flag settings, and to separate them.
3832     //    Test patterns with flags look like 'pattern'i
3833     //    Test patterns without flags are not quoted:   pattern
3834     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3835     //
3836     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3837     RegexMatcher* flagMat = flagPat->matcher(status);
3838
3839     //
3840     // The Perl tests reference several perl-isms, which are evaluated/substituted
3841     //   in the test data.  Not being perl, this must be done explicitly.  Here
3842     //   are string constants and REs for these constructs.
3843     //
3844     UnicodeString nulnulSrc("${nulnul}");
3845     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3846     nulnul = nulnul.unescape();
3847
3848     UnicodeString ffffSrc("${ffff}");
3849     UnicodeString ffff("\\uffff", -1, US_INV);
3850     ffff = ffff.unescape();
3851
3852     //  regexp for $-[0], $+[2], etc.
3853     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
3854     RegexMatcher *groupsMat = groupsPat->matcher(status);
3855
3856     //  regexp for $0, $1, $2, etc.
3857     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
3858     RegexMatcher *cgMat = cgPat->matcher(status);
3859
3860
3861     //
3862     // Main Loop for the Perl Tests, runs once per line from the
3863     //   test data file.
3864     //
3865     int32_t  lineNum = 0;
3866     int32_t  skippedUnimplementedCount = 0;
3867     while (lineMat->find()) {
3868         lineNum++;
3869
3870         //
3871         //  Get a line, break it into its fields, do the Perl
3872         //    variable substitutions.
3873         //
3874         UnicodeString line = lineMat->group(1, status);
3875         UnicodeString fields[7];
3876         fieldPat->split(line, fields, 7, status);
3877
3878         flagMat->reset(fields[0]);
3879         flagMat->matches(status);
3880         UnicodeString pattern  = flagMat->group(2, status);
3881         pattern.findAndReplace("${bang}", "!");
3882         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
3883         pattern.findAndReplace(ffffSrc, ffff);
3884
3885         //
3886         //  Identify patterns that include match flag settings,
3887         //    split off the flags, remove the extra quotes.
3888         //
3889         UnicodeString flagStr = flagMat->group(3, status);
3890         if (U_FAILURE(status)) {
3891             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3892             return;
3893         }
3894         int32_t flags = 0;
3895         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
3896         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
3897         const UChar UChar_m = 0x6d;
3898         const UChar UChar_x = 0x78;
3899         const UChar UChar_y = 0x79;
3900         if (flagStr.indexOf(UChar_i) != -1) {
3901             flags |= UREGEX_CASE_INSENSITIVE;
3902         }
3903         if (flagStr.indexOf(UChar_m) != -1) {
3904             flags |= UREGEX_MULTILINE;
3905         }
3906         if (flagStr.indexOf(UChar_x) != -1) {
3907             flags |= UREGEX_COMMENTS;
3908         }
3909
3910         //
3911         // Compile the test pattern.
3912         //
3913         status = U_ZERO_ERROR;
3914         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
3915         if (status == U_REGEX_UNIMPLEMENTED) {
3916             //
3917             // Test of a feature that is planned for ICU, but not yet implemented.
3918             //   skip the test.
3919             skippedUnimplementedCount++;
3920             delete testPat;
3921             status = U_ZERO_ERROR;
3922             continue;
3923         }
3924
3925         if (U_FAILURE(status)) {
3926             // Some tests are supposed to generate errors.
3927             //   Only report an error for tests that are supposed to succeed.
3928             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
3929                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
3930             {
3931                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
3932             }
3933             status = U_ZERO_ERROR;
3934             delete testPat;
3935             continue;
3936         }
3937
3938         if (fields[2].indexOf(UChar_i) >= 0) {
3939             // ICU should skip this test.
3940             delete testPat;
3941             continue;
3942         }
3943
3944         if (fields[2].indexOf(UChar_c) >= 0) {
3945             // This pattern should have caused a compilation error, but didn't/
3946             errln("line %d: Expected a pattern compile error, got success.", lineNum);
3947             delete testPat;
3948             continue;
3949         }
3950
3951         //
3952         // replace the Perl variables that appear in some of the
3953         //   match data strings.
3954         //
3955         UnicodeString matchString = fields[1];
3956         matchString.findAndReplace(nulnulSrc, nulnul);
3957         matchString.findAndReplace(ffffSrc,   ffff);
3958
3959         // Replace any \n in the match string with an actual new-line char.
3960         //  Don't do full unescape, as this unescapes more than Perl does, which
3961         //  causes other spurious failures in the tests.
3962         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
3963
3964
3965
3966         //
3967         // Run the test, check for expected match/don't match result.
3968         //
3969         RegexMatcher *testMat = testPat->matcher(matchString, status);
3970         UBool found = testMat->find();
3971         UBool expected = FALSE;
3972         if (fields[2].indexOf(UChar_y) >=0) {
3973             expected = TRUE;
3974         }
3975         if (expected != found) {
3976             errln("line %d: Expected %smatch, got %smatch",
3977                 lineNum, expected?"":"no ", found?"":"no " );
3978             continue;
3979         }
3980
3981         // Don't try to check expected results if there is no match.
3982         //   (Some have stuff in the expected fields)
3983         if (!found) {
3984             delete testMat;
3985             delete testPat;
3986             continue;
3987         }
3988
3989         //
3990         // Interpret the Perl expression from the fourth field of the data file,
3991         // building up an ICU string from the results of the ICU match.
3992         //   The Perl expression will contain references to the results of
3993         //     a regex match, including the matched string, capture group strings,
3994         //     group starting and ending indicies, etc.
3995         //
3996         UnicodeString resultString;
3997         UnicodeString perlExpr = fields[3];
3998 #if SUPPORT_MUTATING_INPUT_STRING
3999         groupsMat->reset(perlExpr);
4000         cgMat->reset(perlExpr);
4001 #endif
4002
4003         while (perlExpr.length() > 0) {
4004 #if !SUPPORT_MUTATING_INPUT_STRING
4005             //  Perferred usage.  Reset after any modification to input string.
4006             groupsMat->reset(perlExpr);
4007             cgMat->reset(perlExpr);
4008 #endif
4009
4010             if (perlExpr.startsWith("$&")) {
4011                 resultString.append(testMat->group(status));
4012                 perlExpr.remove(0, 2);
4013             }
4014
4015             else if (groupsMat->lookingAt(status)) {
4016                 // $-[0]   $+[2]  etc.
4017                 UnicodeString digitString = groupsMat->group(2, status);
4018                 int32_t t = 0;
4019                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4020                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4021                 int32_t matchPosition;
4022                 if (plusOrMinus.compare("+") == 0) {
4023                     matchPosition = testMat->end(groupNum, status);
4024                 } else {
4025                     matchPosition = testMat->start(groupNum, status);
4026                 }
4027                 if (matchPosition != -1) {
4028                     ICU_Utility::appendNumber(resultString, matchPosition);
4029                 }
4030                 perlExpr.remove(0, groupsMat->end(status));
4031             }
4032
4033             else if (cgMat->lookingAt(status)) {
4034                 // $1, $2, $3, etc.
4035                 UnicodeString digitString = cgMat->group(1, status);
4036                 int32_t t = 0;
4037                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4038                 if (U_SUCCESS(status)) {
4039                     resultString.append(testMat->group(groupNum, status));
4040                     status = U_ZERO_ERROR;
4041                 }
4042                 perlExpr.remove(0, cgMat->end(status));
4043             }
4044
4045             else if (perlExpr.startsWith("@-")) {
4046                 int32_t i;
4047                 for (i=0; i<=testMat->groupCount(); i++) {
4048                     if (i>0) {
4049                         resultString.append(" ");
4050                     }
4051                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4052                 }
4053                 perlExpr.remove(0, 2);
4054             }
4055
4056             else if (perlExpr.startsWith("@+")) {
4057                 int32_t i;
4058                 for (i=0; i<=testMat->groupCount(); i++) {
4059                     if (i>0) {
4060                         resultString.append(" ");
4061                     }
4062                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4063                 }
4064                 perlExpr.remove(0, 2);
4065             }
4066
4067             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4068                                                      //           or as an escaped sequence (e.g. \n)
4069                 if (perlExpr.length() > 1) {
4070                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4071                 }
4072                 UChar c = perlExpr.charAt(0);
4073                 switch (c) {
4074                 case 'n':   c = '\n'; break;
4075                 // add any other escape sequences that show up in the test expected results.
4076                 }
4077                 resultString.append(c);
4078                 perlExpr.remove(0, 1);
4079             }
4080
4081             else  {
4082                 // Any characters from the perl expression that we don't explicitly
4083                 //  recognize before here are assumed to be literals and copied
4084                 //  as-is to the expected results.
4085                 resultString.append(perlExpr.charAt(0));
4086                 perlExpr.remove(0, 1);
4087             }
4088
4089             if (U_FAILURE(status)) {
4090                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4091                 break;
4092             }
4093         }
4094
4095         //
4096         // Expected Results Compare
4097         //
4098         UnicodeString expectedS(fields[4]);
4099         expectedS.findAndReplace(nulnulSrc, nulnul);
4100         expectedS.findAndReplace(ffffSrc,   ffff);
4101         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4102
4103
4104         if (expectedS.compare(resultString) != 0) {
4105             err("Line %d: Incorrect perl expression results.", lineNum);
4106             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4107         }
4108
4109         delete testMat;
4110         delete testPat;
4111     }
4112
4113     //
4114     // All done.  Clean up allocated stuff.
4115     //
4116     delete cgMat;
4117     delete cgPat;
4118
4119     delete groupsMat;
4120     delete groupsPat;
4121
4122     delete flagMat;
4123     delete flagPat;
4124
4125     delete lineMat;
4126     delete linePat;
4127
4128     delete fieldPat;
4129     delete [] testData;
4130
4131
4132     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4133
4134 }
4135
4136
4137 //-------------------------------------------------------------------------------
4138 //
4139 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4140 //                  (instead of using UnicodeStrings) to test the alternate engine.
4141 //                  The input file for this test is re_tests, the standard regular
4142 //                  expression test data distributed with the Perl source code.
4143 //                  See PerlTests() for more information.
4144 //
4145 //-------------------------------------------------------------------------------
4146 void RegexTest::PerlTestsUTF8() {
4147     char tdd[2048];
4148     const char *srcPath;
4149     UErrorCode  status = U_ZERO_ERROR;
4150     UParseError pe;
4151     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4152     UText       patternText = UTEXT_INITIALIZER;
4153     char       *patternChars = NULL;
4154     int32_t     patternLength;
4155     int32_t     patternCapacity = 0;
4156     UText       inputText = UTEXT_INITIALIZER;
4157     char       *inputChars = NULL;
4158     int32_t     inputLength;
4159     int32_t     inputCapacity = 0;
4160
4161     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4162
4163     //
4164     //  Open and read the test data file.
4165     //
4166     srcPath=getPath(tdd, "re_tests.txt");
4167     if(srcPath==NULL) {
4168         return; /* something went wrong, error already output */
4169     }
4170
4171     int32_t    len;
4172     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4173     if (U_FAILURE(status)) {
4174         return; /* something went wrong, error already output */
4175     }
4176
4177     //
4178     //  Put the test data into a UnicodeString
4179     //
4180     UnicodeString testDataString(FALSE, testData, len);
4181
4182     //
4183     //  Regex to break the input file into lines, and strip the new lines.
4184     //     One line per match, capture group one is the desired data.
4185     //
4186     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4187     if (U_FAILURE(status)) {
4188         dataerrln("RegexPattern::compile() error");
4189         return;
4190     }
4191     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4192
4193     //
4194     //  Regex to split a test file line into fields.
4195     //    There are six fields, separated by tabs.
4196     //
4197     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4198
4199     //
4200     //  Regex to identify test patterns with flag settings, and to separate them.
4201     //    Test patterns with flags look like 'pattern'i
4202     //    Test patterns without flags are not quoted:   pattern
4203     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4204     //
4205     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4206     RegexMatcher* flagMat = flagPat->matcher(status);
4207
4208     //
4209     // The Perl tests reference several perl-isms, which are evaluated/substituted
4210     //   in the test data.  Not being perl, this must be done explicitly.  Here
4211     //   are string constants and REs for these constructs.
4212     //
4213     UnicodeString nulnulSrc("${nulnul}");
4214     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4215     nulnul = nulnul.unescape();
4216
4217     UnicodeString ffffSrc("${ffff}");
4218     UnicodeString ffff("\\uffff", -1, US_INV);
4219     ffff = ffff.unescape();
4220
4221     //  regexp for $-[0], $+[2], etc.
4222     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4223     RegexMatcher *groupsMat = groupsPat->matcher(status);
4224
4225     //  regexp for $0, $1, $2, etc.
4226     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4227     RegexMatcher *cgMat = cgPat->matcher(status);
4228
4229
4230     //
4231     // Main Loop for the Perl Tests, runs once per line from the
4232     //   test data file.
4233     //
4234     int32_t  lineNum = 0;
4235     int32_t  skippedUnimplementedCount = 0;
4236     while (lineMat->find()) {
4237         lineNum++;
4238
4239         //
4240         //  Get a line, break it into its fields, do the Perl
4241         //    variable substitutions.
4242         //
4243         UnicodeString line = lineMat->group(1, status);
4244         UnicodeString fields[7];
4245         fieldPat->split(line, fields, 7, status);
4246
4247         flagMat->reset(fields[0]);
4248         flagMat->matches(status);
4249         UnicodeString pattern  = flagMat->group(2, status);
4250         pattern.findAndReplace("${bang}", "!");
4251         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4252         pattern.findAndReplace(ffffSrc, ffff);
4253
4254         //
4255         //  Identify patterns that include match flag settings,
4256         //    split off the flags, remove the extra quotes.
4257         //
4258         UnicodeString flagStr = flagMat->group(3, status);
4259         if (U_FAILURE(status)) {
4260             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4261             return;
4262         }
4263         int32_t flags = 0;
4264         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4265         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4266         const UChar UChar_m = 0x6d;
4267         const UChar UChar_x = 0x78;
4268         const UChar UChar_y = 0x79;
4269         if (flagStr.indexOf(UChar_i) != -1) {
4270             flags |= UREGEX_CASE_INSENSITIVE;
4271         }
4272         if (flagStr.indexOf(UChar_m) != -1) {
4273             flags |= UREGEX_MULTILINE;
4274         }
4275         if (flagStr.indexOf(UChar_x) != -1) {
4276             flags |= UREGEX_COMMENTS;
4277         }
4278
4279         //
4280         // Put the pattern in a UTF-8 UText
4281         //
4282         status = U_ZERO_ERROR;
4283         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4284         if (status == U_BUFFER_OVERFLOW_ERROR) {
4285             status = U_ZERO_ERROR;
4286             delete[] patternChars;
4287             patternCapacity = patternLength + 1;
4288             patternChars = new char[patternCapacity];
4289             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4290         }
4291         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4292
4293         //
4294         // Compile the test pattern.
4295         //
4296         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4297         if (status == U_REGEX_UNIMPLEMENTED) {
4298             //
4299             // Test of a feature that is planned for ICU, but not yet implemented.
4300             //   skip the test.
4301             skippedUnimplementedCount++;
4302             delete testPat;
4303             status = U_ZERO_ERROR;
4304             continue;
4305         }
4306
4307         if (U_FAILURE(status)) {
4308             // Some tests are supposed to generate errors.
4309             //   Only report an error for tests that are supposed to succeed.
4310             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4311                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4312             {
4313                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4314             }
4315             status = U_ZERO_ERROR;
4316             delete testPat;
4317             continue;
4318         }
4319
4320         if (fields[2].indexOf(UChar_i) >= 0) {
4321             // ICU should skip this test.
4322             delete testPat;
4323             continue;
4324         }
4325
4326         if (fields[2].indexOf(UChar_c) >= 0) {
4327             // This pattern should have caused a compilation error, but didn't/
4328             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4329             delete testPat;
4330             continue;
4331         }
4332
4333
4334         //
4335         // replace the Perl variables that appear in some of the
4336         //   match data strings.
4337         //
4338         UnicodeString matchString = fields[1];
4339         matchString.findAndReplace(nulnulSrc, nulnul);
4340         matchString.findAndReplace(ffffSrc,   ffff);
4341
4342         // Replace any \n in the match string with an actual new-line char.
4343         //  Don't do full unescape, as this unescapes more than Perl does, which
4344         //  causes other spurious failures in the tests.
4345         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4346
4347         //
4348         // Put the input in a UTF-8 UText
4349         //
4350         status = U_ZERO_ERROR;
4351         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4352         if (status == U_BUFFER_OVERFLOW_ERROR) {
4353             status = U_ZERO_ERROR;
4354             delete[] inputChars;
4355             inputCapacity = inputLength + 1;
4356             inputChars = new char[inputCapacity];
4357             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4358         }
4359         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4360
4361         //
4362         // Run the test, check for expected match/don't match result.
4363         //
4364         RegexMatcher *testMat = testPat->matcher(&inputText, RegexPattern::PATTERN_IS_UTEXT, status);
4365         UBool found = testMat->find();
4366         UBool expected = FALSE;
4367         if (fields[2].indexOf(UChar_y) >=0) {
4368             expected = TRUE;
4369         }
4370         if (expected != found) {
4371             errln("line %d: Expected %smatch, got %smatch",
4372                 lineNum, expected?"":"no ", found?"":"no " );
4373             continue;
4374         }
4375
4376         // Don't try to check expected results if there is no match.
4377         //   (Some have stuff in the expected fields)
4378         if (!found) {
4379             delete testMat;
4380             delete testPat;
4381             continue;
4382         }
4383
4384         //
4385         // Interpret the Perl expression from the fourth field of the data file,
4386         // building up an ICU string from the results of the ICU match.
4387         //   The Perl expression will contain references to the results of
4388         //     a regex match, including the matched string, capture group strings,
4389         //     group starting and ending indicies, etc.
4390         //
4391         UnicodeString resultString;
4392         UnicodeString perlExpr = fields[3];
4393
4394         while (perlExpr.length() > 0) {
4395             groupsMat->reset(perlExpr);
4396             cgMat->reset(perlExpr);
4397
4398             if (perlExpr.startsWith("$&")) {
4399                 resultString.append(testMat->group(status));
4400                 perlExpr.remove(0, 2);
4401             }
4402
4403             else if (groupsMat->lookingAt(status)) {
4404                 // $-[0]   $+[2]  etc.
4405                 UnicodeString digitString = groupsMat->group(2, status);
4406                 int32_t t = 0;
4407                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4408                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4409                 int32_t matchPosition;
4410                 if (plusOrMinus.compare("+") == 0) {
4411                     matchPosition = testMat->end(groupNum, status);
4412                 } else {
4413                     matchPosition = testMat->start(groupNum, status);
4414                 }
4415                 if (matchPosition != -1) {
4416                     ICU_Utility::appendNumber(resultString, matchPosition);
4417                 }
4418                 perlExpr.remove(0, groupsMat->end(status));
4419             }
4420
4421             else if (cgMat->lookingAt(status)) {
4422                 // $1, $2, $3, etc.
4423                 UnicodeString digitString = cgMat->group(1, status);
4424                 int32_t t = 0;
4425                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4426                 if (U_SUCCESS(status)) {
4427                     resultString.append(testMat->group(groupNum, status));
4428                     status = U_ZERO_ERROR;
4429                 }
4430                 perlExpr.remove(0, cgMat->end(status));
4431             }
4432
4433             else if (perlExpr.startsWith("@-")) {
4434                 int32_t i;
4435                 for (i=0; i<=testMat->groupCount(); i++) {
4436                     if (i>0) {
4437                         resultString.append(" ");
4438                     }
4439                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4440                 }
4441                 perlExpr.remove(0, 2);
4442             }
4443
4444             else if (perlExpr.startsWith("@+")) {
4445                 int32_t i;
4446                 for (i=0; i<=testMat->groupCount(); i++) {
4447                     if (i>0) {
4448                         resultString.append(" ");
4449                     }
4450                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4451                 }
4452                 perlExpr.remove(0, 2);
4453             }
4454
4455             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4456                                                      //           or as an escaped sequence (e.g. \n)
4457                 if (perlExpr.length() > 1) {
4458                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4459                 }
4460                 UChar c = perlExpr.charAt(0);
4461                 switch (c) {
4462                 case 'n':   c = '\n'; break;
4463                 // add any other escape sequences that show up in the test expected results.
4464                 }
4465                 resultString.append(c);
4466                 perlExpr.remove(0, 1);
4467             }
4468
4469             else  {
4470                 // Any characters from the perl expression that we don't explicitly
4471                 //  recognize before here are assumed to be literals and copied
4472                 //  as-is to the expected results.
4473                 resultString.append(perlExpr.charAt(0));
4474                 perlExpr.remove(0, 1);
4475             }
4476
4477             if (U_FAILURE(status)) {
4478                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4479                 break;
4480             }
4481         }
4482
4483         //
4484         // Expected Results Compare
4485         //
4486         UnicodeString expectedS(fields[4]);
4487         expectedS.findAndReplace(nulnulSrc, nulnul);
4488         expectedS.findAndReplace(ffffSrc,   ffff);
4489         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4490
4491
4492         if (expectedS.compare(resultString) != 0) {
4493             err("Line %d: Incorrect perl expression results.", lineNum);
4494             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4495         }
4496
4497         delete testMat;
4498         delete testPat;
4499     }
4500
4501     //
4502     // All done.  Clean up allocated stuff.
4503     //
4504     delete cgMat;
4505     delete cgPat;
4506
4507     delete groupsMat;
4508     delete groupsPat;
4509
4510     delete flagMat;
4511     delete flagPat;
4512
4513     delete lineMat;
4514     delete linePat;
4515
4516     delete fieldPat;
4517     delete [] testData;
4518
4519     utext_close(&patternText);
4520     utext_close(&inputText);
4521
4522     delete [] patternChars;
4523     delete [] inputChars;
4524
4525
4526     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4527
4528 }
4529
4530
4531 //--------------------------------------------------------------
4532 //
4533 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4534 //             Use this pattern,
4535 //                 "(a?){1,}"
4536 //             The zero-length match will repeat forever.
4537 //                (That this goes into a loop is another bug)
4538 //
4539 //---------------------------------------------------------------
4540 void RegexTest::Bug6149() {
4541     UnicodeString pattern("(a?){1,}");
4542     UnicodeString s("xyz");
4543     uint32_t flags = 0;
4544     UErrorCode status = U_ZERO_ERROR;
4545
4546     RegexMatcher  matcher(pattern, s, flags, status);
4547     UBool result = false;
4548     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4549     REGEX_ASSERT(result == FALSE);
4550  }
4551
4552
4553 //
4554 //   Callbacks()    Test the callback function.
4555 //                  When set, callbacks occur periodically during matching operations,
4556 //                  giving the application code the ability to abort the operation
4557 //                  before its normal completion.
4558 //
4559
4560 struct callBackContext {
4561     RegexTest        *test;
4562     int32_t          maxCalls;
4563     int32_t          numCalls;
4564     int32_t          lastSteps;
4565     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4566 };
4567
4568 U_CDECL_BEGIN
4569 static UBool U_CALLCONV
4570 testCallBackFn(const void *context, int32_t steps) {
4571     callBackContext  *info = (callBackContext *)context;
4572     if (info->lastSteps+1 != steps) {
4573         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4574     }
4575     info->lastSteps = steps;
4576     info->numCalls++;
4577     return (info->numCalls < info->maxCalls);
4578 }
4579 U_CDECL_END
4580
4581 void RegexTest::Callbacks() {
4582    {
4583         // Getter returns NULLs if no callback has been set
4584
4585         //   The variables that the getter will fill in.
4586         //   Init to non-null values so that the action of the getter can be seen.
4587         const void          *returnedContext = &returnedContext;
4588         URegexMatchCallback *returnedFn = &testCallBackFn;
4589
4590         UErrorCode status = U_ZERO_ERROR;
4591         RegexMatcher matcher("x", 0, status);
4592         REGEX_CHECK_STATUS;
4593         matcher.getMatchCallback(returnedFn, returnedContext, status);
4594         REGEX_CHECK_STATUS;
4595         REGEX_ASSERT(returnedFn == NULL);
4596         REGEX_ASSERT(returnedContext == NULL);
4597     }
4598
4599    {
4600         // Set and Get work
4601         callBackContext cbInfo = {this, 0, 0, 0};
4602         const void          *returnedContext;
4603         URegexMatchCallback *returnedFn;
4604         UErrorCode status = U_ZERO_ERROR;
4605         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4606         REGEX_CHECK_STATUS;
4607         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4608         REGEX_CHECK_STATUS;
4609         matcher.getMatchCallback(returnedFn, returnedContext, status);
4610         REGEX_CHECK_STATUS;
4611         REGEX_ASSERT(returnedFn == testCallBackFn);
4612         REGEX_ASSERT(returnedContext == &cbInfo);
4613
4614         // A short-running match shouldn't invoke the callback
4615         status = U_ZERO_ERROR;
4616         cbInfo.reset(1);
4617         UnicodeString s = "xxx";
4618         matcher.reset(s);
4619         REGEX_ASSERT(matcher.matches(status));
4620         REGEX_CHECK_STATUS;
4621         REGEX_ASSERT(cbInfo.numCalls == 0);
4622
4623         // A medium-length match that runs long enough to invoke the
4624         //   callback, but not so long that the callback aborts it.
4625         status = U_ZERO_ERROR;
4626         cbInfo.reset(4);
4627         s = "aaaaaaaaaaaaaaaaaaab";
4628         matcher.reset(s);
4629         REGEX_ASSERT(matcher.matches(status)==FALSE);
4630         REGEX_CHECK_STATUS;
4631         REGEX_ASSERT(cbInfo.numCalls > 0);
4632
4633         // A longer running match that the callback function will abort.
4634         status = U_ZERO_ERROR;
4635         cbInfo.reset(4);
4636         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4637         matcher.reset(s);
4638         REGEX_ASSERT(matcher.matches(status)==FALSE);
4639         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4640         REGEX_ASSERT(cbInfo.numCalls == 4);
4641     }
4642
4643
4644 }
4645
4646
4647 //
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4652 //
4653
4654 struct progressCallBackContext {
4655     RegexTest        *test;
4656     int64_t          lastIndex;
4657     int32_t          maxCalls;
4658     int32_t          numCalls;
4659     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4660 };
4661
4662 U_CDECL_BEGIN
4663 static UBool U_CALLCONV
4664 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4665     progressCallBackContext  *info = (progressCallBackContext *)context;
4666     info->numCalls++;
4667     info->lastIndex = matchIndex;
4668 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4669     return (info->numCalls < info->maxCalls);
4670 }
4671 U_CDECL_END
4672
4673 void RegexTest::FindProgressCallbacks() {
4674    {
4675         // Getter returns NULLs if no callback has been set
4676
4677         //   The variables that the getter will fill in.
4678         //   Init to non-null values so that the action of the getter can be seen.
4679         const void                  *returnedContext = &returnedContext;
4680         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4681
4682         UErrorCode status = U_ZERO_ERROR;
4683         RegexMatcher matcher("x", 0, status);
4684         REGEX_CHECK_STATUS;
4685         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4686         REGEX_CHECK_STATUS;
4687         REGEX_ASSERT(returnedFn == NULL);
4688         REGEX_ASSERT(returnedContext == NULL);
4689     }
4690
4691    {
4692         // Set and Get work
4693         progressCallBackContext cbInfo = {this, 0, 0, 0};
4694         const void                  *returnedContext;
4695         URegexFindProgressCallback  *returnedFn;
4696         UErrorCode status = U_ZERO_ERROR;
4697         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4698         REGEX_CHECK_STATUS;
4699         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4700         REGEX_CHECK_STATUS;
4701         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4702         REGEX_CHECK_STATUS;
4703         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4704         REGEX_ASSERT(returnedContext == &cbInfo);
4705
4706         // A short-running match should NOT invoke the callback.
4707         status = U_ZERO_ERROR;
4708         cbInfo.reset(100);
4709         UnicodeString s = "abxxx";
4710         matcher.reset(s);
4711 #if 0
4712         matcher.setTrace(TRUE);
4713 #endif
4714         REGEX_ASSERT(matcher.find(0, status));
4715         REGEX_CHECK_STATUS;
4716         REGEX_ASSERT(cbInfo.numCalls == 0);
4717
4718         // A medium running match that causes matcher.find() to invoke our callback for each index.
4719         status = U_ZERO_ERROR;
4720         s = "aaaaaaaaaaaaaaaaaaab";
4721         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4722         matcher.reset(s);
4723         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4724         REGEX_CHECK_STATUS;
4725         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4726
4727         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4728         status = U_ZERO_ERROR;
4729         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4730         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4731         matcher.reset(s1);
4732         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4733         REGEX_CHECK_STATUS;
4734         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4735
4736 #if 0
4737         // Now a match that will succeed, but after an interruption
4738         status = U_ZERO_ERROR;
4739         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4740         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4741         matcher.reset(s2);
4742         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4743         REGEX_CHECK_STATUS;
4744         // Now retry the match from where left off
4745         cbInfo.maxCalls = 100; //  No callback limit
4746         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4747         REGEX_CHECK_STATUS;
4748 #endif
4749     }
4750
4751
4752 }
4753
4754
4755 //---------------------------------------------------------------------------
4756 //
4757 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4758 //                             UTexts. The pure-C implementation of UText
4759 //                             has no mutable backing stores, but we can
4760 //                             use UnicodeString here to test the functionality.
4761 //
4762 //---------------------------------------------------------------------------
4763 void RegexTest::PreAllocatedUTextCAPI () {
4764     UErrorCode           status = U_ZERO_ERROR;
4765     URegularExpression  *re;
4766     UText                patternText = UTEXT_INITIALIZER;
4767     UnicodeString        buffer;
4768     UText                bufferText = UTEXT_INITIALIZER;
4769
4770     utext_openUnicodeString(&bufferText, &buffer, &status);
4771
4772     /*
4773      *  getText() and getUText()
4774      */
4775     {
4776         UText  text1 = UTEXT_INITIALIZER;
4777         UText  text2 = UTEXT_INITIALIZER;
4778         UChar  text2Chars[20];
4779         UText  *resultText;
4780
4781         status = U_ZERO_ERROR;
4782         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4783         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4784         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4785         utext_openUChars(&text2, text2Chars, -1, &status);
4786
4787         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4788         re = uregex_openUText(&patternText, 0, NULL, &status);
4789
4790         /* First set a UText */
4791         uregex_setUText(re, &text1, &status);
4792         resultText = uregex_getUText(re, &bufferText, &status);
4793         REGEX_CHECK_STATUS;
4794         REGEX_ASSERT(resultText == &bufferText);
4795         utext_setNativeIndex(resultText, 0);
4796         utext_setNativeIndex(&text1, 0);
4797         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4798
4799         resultText = uregex_getUText(re, &bufferText, &status);
4800         REGEX_CHECK_STATUS;
4801         REGEX_ASSERT(resultText == &bufferText);
4802         utext_setNativeIndex(resultText, 0);
4803         utext_setNativeIndex(&text1, 0);
4804         REGEX_ASSERT(utext_compare(resultText, -1, &text1, -1) == 0);
4805
4806         /* Then set a UChar * */
4807         uregex_setText(re, text2Chars, 7, &status);
4808         resultText = uregex_getUText(re, &bufferText, &status);
4809         REGEX_CHECK_STATUS;
4810         REGEX_ASSERT(resultText == &bufferText);
4811         utext_setNativeIndex(resultText, 0);
4812         utext_setNativeIndex(&text2, 0);
4813         REGEX_ASSERT(utext_compare(resultText, -1, &text2, -1) == 0);
4814
4815         uregex_close(re);
4816         utext_close(&text1);
4817         utext_close(&text2);
4818     }
4819
4820     /*
4821      *  group()
4822      */
4823     {
4824         UChar    text1[80];
4825         UText   *actual;
4826         UBool    result;
4827         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4828
4829         status = U_ZERO_ERROR;
4830         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4831         REGEX_CHECK_STATUS;
4832
4833         uregex_setText(re, text1, -1, &status);
4834         result = uregex_find(re, 0, &status);
4835         REGEX_ASSERT(result==TRUE);
4836
4837         /*  Capture Group 0, the full match.  Should succeed.  */
4838         status = U_ZERO_ERROR;
4839         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4840         REGEX_CHECK_STATUS;
4841         REGEX_ASSERT(actual == &bufferText);
4842         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4843
4844         /*  Capture group #1.  Should succeed. */
4845         status = U_ZERO_ERROR;
4846         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4847         REGEX_CHECK_STATUS;
4848         REGEX_ASSERT(actual == &bufferText);
4849         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
4850
4851         /*  Capture group out of range.  Error. */
4852         status = U_ZERO_ERROR;
4853         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
4854         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
4855         REGEX_ASSERT(actual == &bufferText);
4856
4857         uregex_close(re);
4858
4859     }
4860
4861     /*
4862      *  replaceFirst()
4863      */
4864     {
4865         UChar    text1[80];
4866         UChar    text2[80];
4867         UText    replText = UTEXT_INITIALIZER;
4868         UText   *result;
4869
4870         status = U_ZERO_ERROR;
4871         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4872         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4873         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
4874
4875         re = uregex_openC("x(.*?)x", 0, NULL, &status);
4876         REGEX_CHECK_STATUS;
4877
4878         /*  Normal case, with match */
4879         uregex_setText(re, text1, -1, &status);
4880         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4881         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4882         REGEX_CHECK_STATUS;
4883         REGEX_ASSERT(result == &bufferText);
4884         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
4885
4886         /* No match.  Text should copy to output with no changes.  */
4887         uregex_setText(re, text2, -1, &status);
4888         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4889         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4890         REGEX_CHECK_STATUS;
4891         REGEX_ASSERT(result == &bufferText);
4892         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
4893
4894         /* Unicode escapes */
4895         uregex_setText(re, text1, -1, &status);
4896         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
4897         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4898         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
4899         REGEX_CHECK_STATUS;
4900         REGEX_ASSERT(result == &bufferText);
4901         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
4902
4903         uregex_close(re);
4904         utext_close(&replText);
4905     }
4906
4907
4908     /*
4909      *  replaceAll()
4910      */
4911     {
4912         UChar    text1[80];
4913         UChar    text2[80];
4914         UText    replText = UTEXT_INITIALIZER;
4915         UText   *result;
4916
4917         status = U_ZERO_ERROR;
4918         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
4919         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
4920         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
4921
4922         re = uregex_openC("x(.*?)x", 0, NULL, &status);
4923         REGEX_CHECK_STATUS;
4924
4925         /*  Normal case, with match */
4926         uregex_setText(re, text1, -1, &status);
4927         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4928         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4929         REGEX_CHECK_STATUS;
4930         REGEX_ASSERT(result == &bufferText);
4931         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
4932
4933         /* No match.  Text should copy to output with no changes.  */
4934         uregex_setText(re, text2, -1, &status);
4935         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
4936         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
4937         REGEX_CHECK_STATUS;
4938         REGEX_ASSERT(result == &bufferText);
4939         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
4940
4941         uregex_close(re);
4942         utext_close(&replText);
4943     }
4944
4945
4946     /*
4947      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
4948      *   so we don't need to test it here.
4949      */
4950
4951     utext_close(&bufferText);
4952     utext_close(&patternText);
4953 }
4954
4955 //--------------------------------------------------------------
4956 //
4957 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
4958 //
4959 //---------------------------------------------------------------
4960 void RegexTest::Bug7651() {
4961     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
4962     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
4963     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
4964     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
4965     UnicodeString s("#ff @abcd This is test");
4966     RegexPattern  *REPattern = NULL;
4967     RegexMatcher  *REMatcher = NULL;
4968     UErrorCode status = U_ZERO_ERROR;
4969     UParseError pe;
4970
4971     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
4972     REGEX_CHECK_STATUS;
4973     REMatcher = REPattern->matcher(s, status);
4974     REGEX_CHECK_STATUS;
4975     REGEX_ASSERT(REMatcher->find());
4976     REGEX_ASSERT(REMatcher->start(status) == 0);
4977     delete REPattern;
4978     delete REMatcher;
4979     status = U_ZERO_ERROR;
4980
4981     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
4982     REGEX_CHECK_STATUS;
4983     REMatcher = REPattern->matcher(s, status);
4984     REGEX_CHECK_STATUS;
4985     REGEX_ASSERT(REMatcher->find());
4986     REGEX_ASSERT(REMatcher->start(status) == 0);
4987     delete REPattern;
4988     delete REMatcher;
4989     status = U_ZERO_ERROR;
4990  }
4991
4992 void RegexTest::Bug7740() {
4993     UErrorCode status = U_ZERO_ERROR;
4994     UnicodeString pattern = "(a)";
4995     UnicodeString text = "abcdef";
4996     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
4997     REGEX_CHECK_STATUS;
4998     REGEX_ASSERT(m->lookingAt(status));
4999     REGEX_CHECK_STATUS;
5000     status = U_ILLEGAL_ARGUMENT_ERROR;
5001     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5002     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5003     REGEX_ASSERT(s == "");
5004     delete m;
5005 }
5006
5007
5008
5009
5010 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5011