icuSources/test/intltest/regextst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2014, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 //
   8 //   regextst.cpp
   9 //
  10 //      ICU Regular Expressions test, part of intltest.
  11 //
  12
  13 /*
  14      NOTE!!
  15
  16      PLEASE be careful about ASCII assumptions in this test.
  17      This test is one of the worst repeat offenders.
  18      If you have questions, contact someone on the ICU PMC
  19      who has access to an EBCDIC system.
  20
  21  */
  22
  23 #include "intltest.h"
  24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  25
  26 #include "unicode/regex.h"
  27 #include "unicode/uchar.h"
  28 #include "unicode/ucnv.h"
  29 #include "unicode/uniset.h"
  30 #include "unicode/uregex.h"
  31 #include "unicode/ustring.h"
  32 #include "regextst.h"
  33 #include "uvector.h"
  34 #include "util.h"
  35 #include <stdlib.h>
  36 #include <string.h>
  37 #include <stdio.h>
  38 #include "cstring.h"
  39 #include "uinvchar.h"
  40
  41 #define SUPPORT_MUTATING_INPUT_STRING   0
  42
  43 //---------------------------------------------------------------------------
  44 //
  45 //  Test class boilerplate
  46 //
  47 //---------------------------------------------------------------------------
  48 RegexTest::RegexTest()
  49 {
  50 }
  51
  52
  53 RegexTest::~RegexTest()
  54 {
  55 }
  56
  57
  58
  59 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  60 {
  61     if (exec) logln("TestSuite RegexTest: ");
  62     switch (index) {
  63
  64         case 0: name = "Basic";
  65             if (exec) Basic();
  66             break;
  67         case 1: name = "API_Match";
  68             if (exec) API_Match();
  69             break;
  70         case 2: name = "API_Replace";
  71             if (exec) API_Replace();
  72             break;
  73         case 3: name = "API_Pattern";
  74             if (exec) API_Pattern();
  75             break;
  76         case 4:
  77 #if !UCONFIG_NO_FILE_IO
  78             name = "Extended";
  79             if (exec) Extended();
  80 #else
  81             name = "skip";
  82 #endif
  83             break;
  84         case 5: name = "Errors";
  85             if (exec) Errors();
  86             break;
  87         case 6: name = "PerlTests";
  88             if (exec) PerlTests();
  89             break;
  90         case 7: name = "Callbacks";
  91             if (exec) Callbacks();
  92             break;
  93         case 8: name = "FindProgressCallbacks";
  94             if (exec) FindProgressCallbacks();
  95             break;
  96         case 9: name = "Bug 6149";
  97              if (exec) Bug6149();
  98              break;
  99         case 10: name = "UTextBasic";
 100           if (exec) UTextBasic();
 101           break;
 102         case 11: name = "API_Match_UTF8";
 103           if (exec) API_Match_UTF8();
 104           break;
 105         case 12: name = "API_Replace_UTF8";
 106           if (exec) API_Replace_UTF8();
 107           break;
 108         case 13: name = "API_Pattern_UTF8";
 109           if (exec) API_Pattern_UTF8();
 110           break;
 111         case 14: name = "PerlTestsUTF8";
 112           if (exec) PerlTestsUTF8();
 113           break;
 114         case 15: name = "PreAllocatedUTextCAPI";
 115           if (exec) PreAllocatedUTextCAPI();
 116           break;
 117         case 16: name = "Bug 7651";
 118              if (exec) Bug7651();
 119              break;
 120         case 17: name = "Bug 7740";
 121             if (exec) Bug7740();
 122             break;
 123         case 18: name = "Bug 8479";
 124             if (exec) Bug8479();
 125             break;
 126         case 19: name = "Bug 7029";
 127             if (exec) Bug7029();
 128             break;
 129         case 20: name = "CheckInvBufSize";
 130             if (exec) CheckInvBufSize();
 131             break;
 132         case 21: name = "Bug 9283";
 133             if (exec) Bug9283();
 134             break;
 135         case 22: name = "Bug10459";
 136             if (exec) Bug10459();
 137             break;
 138
 139         default: name = "";
 140             break; //needed to end loop
 141     }
 142 }
 143
 144
 145
 146 /**
 147  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 148  * into ASCII.
 149  * @see utext_openUTF8
 150  */
 151 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 152
 153 //---------------------------------------------------------------------------
 154 //
 155 //   Error Checking / Reporting macros used in all of the tests.
 156 //
 157 //---------------------------------------------------------------------------
 158
 159 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 160   int64_t oldIndex = utext_getNativeIndex(text);
 161   utext_setNativeIndex(text, 0);
 162   char *bufPtr = buf;
 163   UChar32 c = utext_next32From(text, 0);
 164   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 165     if (0x000020<=c && c<0x00007e) {
 166       *bufPtr = c;
 167     } else {
 168 #if 0
 169       sprintf(bufPtr,"U+%04X", c);
 170       bufPtr+= strlen(bufPtr)-1;
 171 #else
 172       *bufPtr = '%';
 173 #endif
 174     }
 175     bufPtr++;
 176     c = UTEXT_NEXT32(text);
 177   }
 178   *bufPtr = 0;
 179 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 180   char *ebuf = (char*)malloc(bufLen);
 181   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 182   uprv_strncpy(buf, ebuf, bufLen);
 183   free((void*)ebuf);
 184 #endif
 185   utext_setNativeIndex(text, oldIndex);
 186 }
 187
 188
 189 static char ASSERT_BUF[1024];
 190
 191 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 192   if(message.length()==0) {
 193     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 194   } else {
 195     UnicodeString buf;
 196     IntlTest::prettify(message,buf);
 197     if(buf.length()==0) {
 198       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 199     } else {
 200       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 201       if(ASSERT_BUF[0]==0) {
 202         ASSERT_BUF[0]=0;
 203         for(int32_t i=0;i<buf.length();i++) {
 204           UChar ch = buf[i];
 205           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 206         }
 207       }
 208     }
 209   }
 210   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 211   return ASSERT_BUF;
 212 }
 213
// Number of elements in a true array (not a pointer), as an int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// Logs (via logln) a printable rendering of a UText, tagged with file and line.
// The rendering goes through a local 200-byte buffer, so long text is truncated.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// If the in-scope variable `status` holds a failure code, reports it with
// dataerrln() and RETURNS from the enclosing (void) function.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure with errln() when expr evaluates to FALSE.
// Does not return; the enclosing test keeps running.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr with a fresh local `status` (which hides any outer variable
// of that name inside expr) and reports a failure if the resulting status is
// not exactly errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS, but also reports a caller-supplied line number,
// prints the numeric status, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT, but also reports a caller-supplied line number and
// RETURNS from the enclosing (void) function on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports a failure when ustr==inv is false.  The message shows ustr as
// rendered by extractToAssertBuf() and inv printed with %s.
#define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
 234
 235
 236 static UBool testUTextEqual(UText *uta, UText *utb) {
 237     UChar32 ca = 0;
 238     UChar32 cb = 0;
 239     utext_setNativeIndex(uta, 0);
 240     utext_setNativeIndex(utb, 0);
 241     do {
 242         ca = utext_next32(uta);
 243         cb = utext_next32(utb);
 244         if (ca != cb) {
 245             break;
 246         }
 247     } while (ca != U_SENTINEL);
 248     return ca == cb;
 249 }
 250
 251
 252 /**
 253  * @param expected expected text in UTF-8 (not platform) codepage
 254  */
 255 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 256     UErrorCode status = U_ZERO_ERROR;
 257     UText expectedText = UTEXT_INITIALIZER;
 258     utext_openUTF8(&expectedText, expected, -1, &status);
 259     if(U_FAILURE(status)) {
 260       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 261       return;
 262     }
 263     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 264       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 265       return;
 266     }
 267     utext_setNativeIndex(actual, 0);
 268     if (!testUTextEqual(&expectedText, actual)) {
 269         char buf[201 /*21*/];
 270         char expectedBuf[201];
 271         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 272         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 273         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 274     }
 275     utext_close(&expectedText);
 276 }
 277 /**
 278  * @param expected invariant (platform local text) input
 279  */
 280
 281 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 282     UErrorCode status = U_ZERO_ERROR;
 283     UText expectedText = UTEXT_INITIALIZER;
 284     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 285     if(U_FAILURE(status)) {
 286       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 287       return;
 288     }
 289     utext_setNativeIndex(actual, 0);
 290     if (!testUTextEqual(&expectedText, actual)) {
 291         char buf[201 /*21*/];
 292         char expectedBuf[201];
 293         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 294         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 295         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 296     }
 297     utext_close(&expectedText);
 298 }
 299
 300 /**
 301  * Assumes utf-8 input
 302  */
 303 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
 304 /**
 305  * Assumes Invariant input
 306  */
 307 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 308
 309 /**
 310  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 311  * passed into utext_openUTF8. An error will be given if
 312  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 313  */
 314
 315 #define INV_BUFSIZ 2048 /* increase this if too small */
 316
 317 static int64_t inv_next=0;
 318
 319 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
 320 static char inv_buf[INV_BUFSIZ];
 321 #endif
 322
 323 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 324   if(length==-1) length=strlen(inv);
 325 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 326   inv_next+=length;
 327   return utext_openUTF8(ut, inv, length, status);
 328 #else
 329   if(inv_next+length+1>INV_BUFSIZ) {
 330     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 331             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 332     *status = U_MEMORY_ALLOCATION_ERROR;
 333     return NULL;
 334   }
 335
 336   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 337   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 338   inv_next+=length;
 339
 340 #if 0
 341   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 342 #endif
 343
 344   return utext_openUTF8(ut, (const char*)buf, length, status);
 345 #endif
 346 }
 347
 348
 349 //---------------------------------------------------------------------------
 350 //
 351 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 352 //                       for the LookingAt() and  Match() functions.
 353 //
 354 //       usage:
 355 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 356 //
 357 //          The expected results are UBool - TRUE or FALSE.
 358 //          The input text is unescaped.  The pattern is not.
 359 //
 360 //
 361 //---------------------------------------------------------------------------
 362
 363 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 364
 365 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 366     const UnicodeString pattern(pat, -1, US_INV);
 367     const UnicodeString inputText(text, -1, US_INV);
 368     UErrorCode          status  = U_ZERO_ERROR;
 369     UParseError         pe;
 370     RegexPattern        *REPattern = NULL;
 371     RegexMatcher        *REMatcher = NULL;
 372     UBool               retVal     = TRUE;
 373
 374     UnicodeString patString(pat, -1, US_INV);
 375     REPattern = RegexPattern::compile(patString, 0, pe, status);
 376     if (U_FAILURE(status)) {
 377         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 378             line, u_errorName(status));
 379         return FALSE;
 380     }
 381     if (line==376) { REPattern->dumpPattern();}
 382
 383     UnicodeString inputString(inputText);
 384     UnicodeString unEscapedInput = inputString.unescape();
 385     REMatcher = REPattern->matcher(unEscapedInput, status);
 386     if (U_FAILURE(status)) {
 387         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 388             line, u_errorName(status));
 389         return FALSE;
 390     }
 391
 392     UBool actualmatch;
 393     actualmatch = REMatcher->lookingAt(status);
 394     if (U_FAILURE(status)) {
 395         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 396             line, u_errorName(status));
 397         retVal =  FALSE;
 398     }
 399     if (actualmatch != looking) {
 400         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 401         retVal = FALSE;
 402     }
 403
 404     status = U_ZERO_ERROR;
 405     actualmatch = REMatcher->matches(status);
 406     if (U_FAILURE(status)) {
 407         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 408             line, u_errorName(status));
 409         retVal = FALSE;
 410     }
 411     if (actualmatch != match) {
 412         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 413         retVal = FALSE;
 414     }
 415
 416     if (retVal == FALSE) {
 417         REPattern->dumpPattern();
 418     }
 419
 420     delete REPattern;
 421     delete REMatcher;
 422     return retVal;
 423 }
 424
 425
 426 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 427     UText               pattern    = UTEXT_INITIALIZER;
 428     int32_t             inputUTF8Length;
 429     char                *textChars = NULL;
 430     UText               inputText  = UTEXT_INITIALIZER;
 431     UErrorCode          status     = U_ZERO_ERROR;
 432     UParseError         pe;
 433     RegexPattern        *REPattern = NULL;
 434     RegexMatcher        *REMatcher = NULL;
 435     UBool               retVal     = TRUE;
 436
 437     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 438     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 439     if (U_FAILURE(status)) {
 440         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 441             line, u_errorName(status));
 442         return FALSE;
 443     }
 444
 445     UnicodeString inputString(text, -1, US_INV);
 446     UnicodeString unEscapedInput = inputString.unescape();
 447     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 448     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 449
 450     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 451     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 452         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 453         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 454         return TRUE; // not a failure of the Regex engine
 455     }
 456     status = U_ZERO_ERROR; // buffer overflow
 457     textChars = new char[inputUTF8Length+1];
 458     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 459     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 460
 461     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 462     if (U_FAILURE(status)) {
 463         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 464             line, u_errorName(status));
 465         return FALSE;
 466     }
 467
 468     UBool actualmatch;
 469     actualmatch = REMatcher->lookingAt(status);
 470     if (U_FAILURE(status)) {
 471         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 472             line, u_errorName(status));
 473         retVal =  FALSE;
 474     }
 475     if (actualmatch != looking) {
 476         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 477         retVal = FALSE;
 478     }
 479
 480     status = U_ZERO_ERROR;
 481     actualmatch = REMatcher->matches(status);
 482     if (U_FAILURE(status)) {
 483         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 484             line, u_errorName(status));
 485         retVal = FALSE;
 486     }
 487     if (actualmatch != match) {
 488         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 489         retVal = FALSE;
 490     }
 491
 492     if (retVal == FALSE) {
 493         REPattern->dumpPattern();
 494     }
 495
 496     delete REPattern;
 497     delete REMatcher;
 498     utext_close(&inputText);
 499     utext_close(&pattern);
 500     delete[] textChars;
 501     return retVal;
 502 }
 503
 504
 505
 506 //---------------------------------------------------------------------------
 507 //
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns
 510 //
 511 //       usage:
 512 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 513 //
 514 //---------------------------------------------------------------------------
 515 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 516
 517 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 518                           UErrorCode expectedStatus, int32_t line) {
 519     UnicodeString       pattern(pat);
 520
 521     UErrorCode          status         = U_ZERO_ERROR;
 522     UParseError         pe;
 523     RegexPattern        *callerPattern = NULL;
 524
 525     //
 526     //  Compile the caller's pattern
 527     //
 528     UnicodeString patString(pat);
 529     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 530     if (status != expectedStatus) {
 531         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 532     } else {
 533         if (status != U_ZERO_ERROR) {
 534             if (pe.line != errLine || pe.offset != errCol) {
 535                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 536                     line, errLine, errCol, pe.line, pe.offset);
 537             }
 538         }
 539     }
 540
 541     delete callerPattern;
 542
 543     //
 544     //  Compile again, using a UTF-8-based UText
 545     //
 546     UText patternText = UTEXT_INITIALIZER;
 547     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 548     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 549     if (status != expectedStatus) {
 550         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 551     } else {
 552         if (status != U_ZERO_ERROR) {
 553             if (pe.line != errLine || pe.offset != errCol) {
 554                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 555                     line, errLine, errCol, pe.line, pe.offset);
 556             }
 557         }
 558     }
 559
 560     delete callerPattern;
 561     utext_close(&patternText);
 562 }
 563
 564
 565
 566 //---------------------------------------------------------------------------
 567 //
 568 //      Basic      Check for basic functionality of regex pattern matching.
 569 //                 Avoid the use of REGEX_FIND test macro, which has
 570 //                 substantial dependencies on basic Regex functionality.
 571 //
 572 //---------------------------------------------------------------------------
// Each case below has the form
//     REGEX_TESTLM(pattern, input, expected lookingAt(), expected matches());
// and is run by the macro through both doRegexLMTest() and doRegexLMTestUTF8().
// The macro passes __LINE__ along, so a failure message identifies the case by
// its line in this function.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
// Disabled scratch area: a case under investigation can be placed here so
// that it runs before everything else.  Note the exit(1) after the block.
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
 719
 720
 721 //---------------------------------------------------------------------------
 722 //
 723 //    UTextBasic   Check for quirks that are specific to the UText
 724 //                 implementation.
 725 //
 726 //---------------------------------------------------------------------------
 727 void RegexTest::UTextBasic() {
 728     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 729     UErrorCode status = U_ZERO_ERROR;
 730     UText pattern = UTEXT_INITIALIZER;
 731     utext_openUTF8(&pattern, str_abc, -1, &status);
 732     RegexMatcher matcher(&pattern, 0, status);
 733     REGEX_CHECK_STATUS;
 734
 735     UText input = UTEXT_INITIALIZER;
 736     utext_openUTF8(&input, str_abc, -1, &status);
 737     REGEX_CHECK_STATUS;
 738     matcher.reset(&input);
 739     REGEX_CHECK_STATUS;
 740     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 741
 742     matcher.reset(matcher.inputText());
 743     REGEX_CHECK_STATUS;
 744     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 745
 746     utext_close(&pattern);
 747     utext_close(&input);
 748 }
 749
 750
 751 //---------------------------------------------------------------------------
 752 //
 753 //      API_Match   Test that the API for class RegexMatcher
 754 //                  is present and nominally working, but excluding functions
 755 //                  implementing replace operations.
 756 //
 757 //---------------------------------------------------------------------------
 758 void RegexTest::API_Match() {
 759     UParseError         pe;
 760     UErrorCode          status=U_ZERO_ERROR;
 761     int32_t             flags = 0;
 762     // The braced sections below each exercise one area of the RegexMatcher matching API.
 763     //
 764     // Debug - move failing test cases into the disabled block below to run them early
 765     //
 766 #if 0
 767     {
 768     }
 769     return;
 770 #endif
 771
 772     //
 773     // Simple pattern compilation
 774     //
 775     {
 776         UnicodeString       re("abc");
 777         RegexPattern        *pat2;
 778         pat2 = RegexPattern::compile(re, flags, pe, status);
 779         REGEX_CHECK_STATUS;
 780
 781         UnicodeString inStr1 = "abcdef this is a test";
 782         UnicodeString instr2 = "not abc";
 783         UnicodeString empty  = "";
 784
 785
 786         //
 787         // Matcher creation and reset.
 788         //
 789         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 790         REGEX_CHECK_STATUS;
 791         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 792         REGEX_ASSERT(m1->input() == inStr1);
 793         m1->reset(instr2);
 794         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 795         REGEX_ASSERT(m1->input() == instr2);
 796         m1->reset(inStr1);
 797         REGEX_ASSERT(m1->input() == inStr1);
 798         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 799         m1->reset(empty);
 800         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 801         REGEX_ASSERT(m1->input() == empty);
 802         REGEX_ASSERT(&m1->pattern() == pat2);
 803
 804         //
 805         //  reset(pos, status)
 806         //
 807         m1->reset(inStr1);
 808         m1->reset(4, status);
 809         REGEX_CHECK_STATUS;
 810         REGEX_ASSERT(m1->input() == inStr1);
 811         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 812
 813         m1->reset(-1, status);
 814         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 815         status = U_ZERO_ERROR;
 816
 817         m1->reset(0, status);
 818         REGEX_CHECK_STATUS;
 819         status = U_ZERO_ERROR;
 820
 821         int32_t len = m1->input().length();
 822         m1->reset(len-1, status);
 823         REGEX_CHECK_STATUS;
 824         status = U_ZERO_ERROR;
 825
 826         m1->reset(len, status);
 827         REGEX_CHECK_STATUS;
 828         status = U_ZERO_ERROR;
 829
 830         m1->reset(len+1, status);
 831         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 832         status = U_ZERO_ERROR;
 833
 834         //
 835         // matches(pos, status)
 836         //
 837         m1->reset(instr2);  // "not abc"
 838         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 839         m1->reset();
 840         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 841         m1->reset();
 842         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 843         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 844         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 845         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 846
 847         // Match() at end of string should fail, but should not
 848         //  be an error.
 849         status = U_ZERO_ERROR;
 850         len = m1->input().length();
 851         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 852         REGEX_CHECK_STATUS;
 853
 854         // Match beyond end of string should fail with an error.
 855         status = U_ZERO_ERROR;
 856         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 857         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 858
 859         // Successful match at end of string.
 860         {
 861             status = U_ZERO_ERROR;
 862             RegexMatcher m("A?", 0, status);  // will match zero length string.
 863             REGEX_CHECK_STATUS;
 864             m.reset(inStr1);
 865             len = inStr1.length();
 866             REGEX_ASSERT(m.matches(len, status) == TRUE);
 867             REGEX_CHECK_STATUS;
 868             m.reset(empty);
 869             REGEX_ASSERT(m.matches(0, status) == TRUE);
 870             REGEX_CHECK_STATUS;
 871         }
 872
 873
 874         //
 875         // lookingAt(pos, status)
 876         //
 877         status = U_ZERO_ERROR;
 878         m1->reset(instr2);  // "not abc"
 879         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 880         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 881         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 882         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 883         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 885         status = U_ZERO_ERROR;
 886         len = m1->input().length();
 887         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 888         REGEX_CHECK_STATUS;
 889         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 890         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 891
 892         delete m1;
 893         delete pat2;
 894     }
 895
 896
 897     //
 898     // Capture Group.
 899     //     RegexMatcher::start();
 900     //     RegexMatcher::end();
 901     //     RegexMatcher::groupCount();
 902     //
 903     {
 904         int32_t             flags=0;
 905         UParseError         pe;
 906         UErrorCode          status=U_ZERO_ERROR;
 907
 908         UnicodeString       re("01(23(45)67)(.*)");
 909         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 910         REGEX_CHECK_STATUS;
 911         UnicodeString data = "0123456789";
 912
 913         RegexMatcher *matcher = pat->matcher(data, status);
 914         REGEX_CHECK_STATUS;
 915         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 916         static const int32_t matchStarts[] = {0,  2, 4, 8};   // expected start(i) for groups 0..3
 917         static const int32_t matchEnds[]   = {10, 8, 6, 10};  // expected end(i) for groups 0..3
 918         int32_t i;
 919         for (i=0; i<4; i++) {
 920             int32_t actualStart = matcher->start(i, status);
 921             REGEX_CHECK_STATUS;
 922             if (actualStart != matchStarts[i]) {
 923                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 924                     __LINE__, i, matchStarts[i], actualStart);
 925             }
 926             int32_t actualEnd = matcher->end(i, status);
 927             REGEX_CHECK_STATUS;
 928             if (actualEnd != matchEnds[i]) {
 929                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 930                     __LINE__, i, matchEnds[i], actualEnd);
 931             }
 932         }
 933
 934         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 935         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 936
 937         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 938         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 939         matcher->reset();  // no match attempted since this reset, so start() below must report an error
 940         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 941
 942         matcher->lookingAt(status);
 943         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 944         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 945         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 946         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 947         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 948         REGEX_CHECK_STATUS;
 949         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 950         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 951         matcher->reset();
 952         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 953
 954         delete matcher;
 955         delete pat;
 956
 957     }
 958
 959     //
 960     //  find
 961     //
 962     {
 963         int32_t             flags=0;
 964         UParseError         pe;
 965         UErrorCode          status=U_ZERO_ERROR;
 966
 967         UnicodeString       re("abc");
 968         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 969         REGEX_CHECK_STATUS;
 970         UnicodeString data = ".abc..abc...abc..";
 971         //                    012345678901234567
 972
 973         RegexMatcher *matcher = pat->matcher(data, status);
 974         REGEX_CHECK_STATUS;
 975         REGEX_ASSERT(matcher->find());
 976         REGEX_ASSERT(matcher->start(status) == 1);
 977         REGEX_ASSERT(matcher->find());
 978         REGEX_ASSERT(matcher->start(status) == 6);
 979         REGEX_ASSERT(matcher->find());
 980         REGEX_ASSERT(matcher->start(status) == 12);
 981         REGEX_ASSERT(matcher->find() == FALSE);
 982         REGEX_ASSERT(matcher->find() == FALSE);
 983
 984         matcher->reset();
 985         REGEX_ASSERT(matcher->find());
 986         REGEX_ASSERT(matcher->start(status) == 1);
 987
 988         REGEX_ASSERT(matcher->find(0, status));
 989         REGEX_ASSERT(matcher->start(status) == 1);
 990         REGEX_ASSERT(matcher->find(1, status));
 991         REGEX_ASSERT(matcher->start(status) == 1);
 992         REGEX_ASSERT(matcher->find(2, status));
 993         REGEX_ASSERT(matcher->start(status) == 6);
 994         REGEX_ASSERT(matcher->find(12, status));
 995         REGEX_ASSERT(matcher->start(status) == 12);
 996         REGEX_ASSERT(matcher->find(13, status) == FALSE);
 997         REGEX_ASSERT(matcher->find(16, status) == FALSE);
 998         REGEX_ASSERT(matcher->find(17, status) == FALSE);
 999         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1000
1001         status = U_ZERO_ERROR;
1002         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1003         status = U_ZERO_ERROR;
1004         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1005
1006         REGEX_ASSERT(matcher->groupCount() == 0);
1007
1008         delete matcher;
1009         delete pat;
1010     }
1011
1012
1013     //
1014     //  find, with \G in pattern (true if at the end of a previous match).
1015     //
1016     {
1017         int32_t             flags=0;
1018         UParseError         pe;
1019         UErrorCode          status=U_ZERO_ERROR;
1020
1021         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1022         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1023         REGEX_CHECK_STATUS;
1024         UnicodeString data = ".abcabc.abc..";
1025         //                    012345678901234567
1026
1027         RegexMatcher *matcher = pat->matcher(data, status);
1028         REGEX_CHECK_STATUS;
1029         REGEX_ASSERT(matcher->find());
1030         REGEX_ASSERT(matcher->start(status) == 0);
1031         REGEX_ASSERT(matcher->start(1, status) == -1);
1032         REGEX_ASSERT(matcher->start(2, status) == 1);
1033
1034         REGEX_ASSERT(matcher->find());
1035         REGEX_ASSERT(matcher->start(status) == 4);
1036         REGEX_ASSERT(matcher->start(1, status) == 4);
1037         REGEX_ASSERT(matcher->start(2, status) == -1);
1038         REGEX_CHECK_STATUS;
1039
1040         delete matcher;
1041         delete pat;
1042     }
1043
1044     //
1045     //   find with zero length matches, match position should bump ahead
1046     //     to prevent loops.
1047     //
1048     {
1049         int32_t                 i;
1050         UErrorCode          status=U_ZERO_ERROR;
1051         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
1052                                                       //   using an always-true look-ahead.
1053         REGEX_CHECK_STATUS;
1054         UnicodeString s("    ");
1055         m.reset(s);
1056         for (i=0; ; i++) {
1057             if (m.find() == FALSE) {
1058                 break;
1059             }
1060             REGEX_ASSERT(m.start(status) == i);
1061             REGEX_ASSERT(m.end(status) == i);
1062         }
1063         REGEX_ASSERT(i==5);  // five matches expected, at positions 0..4
1064
1065         // Check that the bump goes over surrogate pairs OK
1066         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1067         s = s.unescape();
1068         m.reset(s);
1069         for (i=0; ; i+=2) {
1070             if (m.find() == FALSE) {
1071                 break;
1072             }
1073             REGEX_ASSERT(m.start(status) == i);
1074             REGEX_ASSERT(m.end(status) == i);
1075         }
1076         REGEX_ASSERT(i==10);
1077     }
1078     {
1079         // find() loop breaking test.
1080         //        with pattern of /.?/, should see a series of one char matches, then a single
1081         //        match of zero length at the end of the input string.
1082         int32_t                 i;
1083         UErrorCode          status=U_ZERO_ERROR;
1084         RegexMatcher        m(".?", 0, status);
1085         REGEX_CHECK_STATUS;
1086         UnicodeString s("    ");
1087         m.reset(s);
1088         for (i=0; ; i++) {
1089             if (m.find() == FALSE) {
1090                 break;
1091             }
1092             REGEX_ASSERT(m.start(status) == i);
1093             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1094         }
1095         REGEX_ASSERT(i==5);
1096     }
1097
1098
1099     //
1100     // Matchers with no input string behave as if they had an empty input string.
1101     //
1102
1103     {
1104         UErrorCode status = U_ZERO_ERROR;
1105         RegexMatcher  m(".?", 0, status);
1106         REGEX_CHECK_STATUS;
1107         REGEX_ASSERT(m.find());
1108         REGEX_ASSERT(m.start(status) == 0);
1109         REGEX_ASSERT(m.input() == "");
1110     }
1111     {
1112         UErrorCode status = U_ZERO_ERROR;
1113         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1114         RegexMatcher  *m = p->matcher(status);  // NOTE(review): p is dereferenced before the compile status is checked
1115         REGEX_CHECK_STATUS;
1116
1117         REGEX_ASSERT(m->find() == FALSE);
1118         REGEX_ASSERT(m->input() == "");
1119         delete m;
1120         delete p;
1121     }
1122
1123     //
1124     // Regions
1125     //
1126     {
1127         UErrorCode status = U_ZERO_ERROR;
1128         UnicodeString testString("This is test data");
1129         RegexMatcher m(".*", testString,  0, status);
1130         REGEX_CHECK_STATUS;
1131         REGEX_ASSERT(m.regionStart() == 0);
1132         REGEX_ASSERT(m.regionEnd() == testString.length());
1133         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1134         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1135
1136         m.region(2,4, status);
1137         REGEX_CHECK_STATUS;
1138         REGEX_ASSERT(m.matches(status));
1139         REGEX_ASSERT(m.start(status)==2);
1140         REGEX_ASSERT(m.end(status)==4);
1141         REGEX_CHECK_STATUS;
1142
1143         m.reset();
1144         REGEX_ASSERT(m.regionStart() == 0);
1145         REGEX_ASSERT(m.regionEnd() == testString.length());
1146
1147         UnicodeString shorterString("short");
1148         m.reset(shorterString);
1149         REGEX_ASSERT(m.regionStart() == 0);
1150         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1151
1152         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1153         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1154         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1155         REGEX_ASSERT(&m == &m.reset());
1156         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1157
1158         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1159         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1160         REGEX_ASSERT(&m == &m.reset());
1161         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1164         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1165         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1166         REGEX_ASSERT(&m == &m.reset());
1167         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1168
1169         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1170         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1171         REGEX_ASSERT(&m == &m.reset());
1172         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1173
1174     }
1175
1176     //
1177     // hitEnd() and requireEnd()
1178     //
1179     {
1180         UErrorCode status = U_ZERO_ERROR;
1181         UnicodeString testString("aabb");
1182         RegexMatcher m1(".*", testString,  0, status);
1183         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1184         REGEX_ASSERT(m1.hitEnd() == TRUE);
1185         REGEX_ASSERT(m1.requireEnd() == FALSE);
1186         REGEX_CHECK_STATUS;
1187
1188         status = U_ZERO_ERROR;
1189         RegexMatcher m2("a*", testString, 0, status);
1190         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1191         REGEX_ASSERT(m2.hitEnd() == FALSE);
1192         REGEX_ASSERT(m2.requireEnd() == FALSE);
1193         REGEX_CHECK_STATUS;
1194
1195         status = U_ZERO_ERROR;
1196         RegexMatcher m3(".*$", testString, 0, status);
1197         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1198         REGEX_ASSERT(m3.hitEnd() == TRUE);
1199         REGEX_ASSERT(m3.requireEnd() == TRUE);
1200         REGEX_CHECK_STATUS;
1201     }
1202
1203
1204     //
1205     // Compilation error on reset with UChar *
1206     //   These were a hazard that people were stumbling over with runtime errors.
1207     //   Changed them to compiler errors by adding private methods that more closely
1208     //   matched the incorrect use of the functions.
1209     //
1210 #if 0
1211     {
1212         UErrorCode status = U_ZERO_ERROR;
1213         UChar ucharString[20];
1214         RegexMatcher m(".", 0, status);
1215         m.reset(ucharString);  // should not compile.
1216
1217         RegexPattern *p = RegexPattern::compile(".", 0, status);
1218         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1219
1220         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1221     }
1222 #endif
1223
1224     //
1225     //  Time Outs.
1226     //       Note:  These tests will need to be changed when the regexp engine is
1227     //              able to detect and cut short the exponential time behavior on
1228     //              this type of match.
1229     //
1230     {
1231         UErrorCode status = U_ZERO_ERROR;
1232         //    Enough 'a's in the string to cause the match to time out.
1233         //       (Each additional 'a' doubles the time)
1234         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1235         RegexMatcher matcher("(a+)+b", testString, 0, status);
1236         REGEX_CHECK_STATUS;
1237         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1238         matcher.setTimeLimit(100, status);
1239         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1240         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1241         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1242     }
1243     {
1244         UErrorCode status = U_ZERO_ERROR;
1245         //   Few enough 'a's to slip in under the time limit.
1246         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1247         RegexMatcher matcher("(a+)+b", testString, 0, status);
1248         REGEX_CHECK_STATUS;
1249         matcher.setTimeLimit(100, status);
1250         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1251         REGEX_CHECK_STATUS;
1252     }
1253
1254     //
1255     //  Stack Limits
1256     //
1257     {
1258         UErrorCode status = U_ZERO_ERROR;
1259         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1260
1261         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1262         //   of the '+', and makes the stack frames larger.
1263         RegexMatcher matcher("(A)+A$", testString, 0, status);
1264
1265         // With the default stack, this match should fail to run
1266         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1267         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1268
1269         // With unlimited stack, it should run
1270         status = U_ZERO_ERROR;
1271         matcher.setStackLimit(0, status);
1272         REGEX_CHECK_STATUS;
1273         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1274         REGEX_CHECK_STATUS;
1275         REGEX_ASSERT(matcher.getStackLimit() == 0);
1276
1277         // With a limited stack, the match should fail
1278         status = U_ZERO_ERROR;
1279         matcher.setStackLimit(10000, status);
1280         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1281         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1282         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1283     }
1284
1285         // A pattern that doesn't save state should work with
1286         //   a minimal sized stack
1287     {
1288         UErrorCode status = U_ZERO_ERROR;
1289         UnicodeString testString = "abc";
1290         RegexMatcher matcher("abc", testString, 0, status);
1291         REGEX_CHECK_STATUS;
1292         matcher.setStackLimit(30, status);
1293         REGEX_CHECK_STATUS;
1294         REGEX_ASSERT(matcher.matches(status) == TRUE);
1295         REGEX_CHECK_STATUS;
1296         REGEX_ASSERT(matcher.getStackLimit() == 30);
1297
1298         // Negative stack sizes should fail
1299         status = U_ZERO_ERROR;
1300         matcher.setStackLimit(1000, status);
1301         REGEX_CHECK_STATUS;
1302         matcher.setStackLimit(-1, status);
1303         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1304         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1305     }
1306
1307
1308 }
1309
1310
1311
1312
1313
1314
1315 //---------------------------------------------------------------------------
1316 //
1317 //      API_Replace        API test for class RegexMatcher, testing the
1318 //                         Replace family of functions.
1319 //
1320 //---------------------------------------------------------------------------
1321 void RegexTest::API_Replace() {
1322     //
1323     //  Replace
1324     //
1325     int32_t             flags=0;
1326     UParseError         pe;
1327     UErrorCode          status=U_ZERO_ERROR;
1328     // pat and matcher (pattern "abc") are reused by several sections below and deleted at the end.
1329     UnicodeString       re("abc");
1330     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1331     REGEX_CHECK_STATUS;
1332     UnicodeString data = ".abc..abc...abc..";
1333     //                    012345678901234567
1334     RegexMatcher *matcher = pat->matcher(data, status);
1335     // NOTE(review): status from pat->matcher() is not checked until after the first replaceFirst() call.
1336     //
1337     //  Plain vanilla matches.
1338     //
1339     UnicodeString  dest;
1340     dest = matcher->replaceFirst("yz", status);
1341     REGEX_CHECK_STATUS;
1342     REGEX_ASSERT(dest == ".yz..abc...abc..");
1343
1344     dest = matcher->replaceAll("yz", status);
1345     REGEX_CHECK_STATUS;
1346     REGEX_ASSERT(dest == ".yz..yz...yz..");
1347
1348     //
1349     //  Plain vanilla non-matches.
1350     //
1351     UnicodeString d2 = ".abx..abx...abx..";
1352     matcher->reset(d2);
1353     dest = matcher->replaceFirst("yz", status);
1354     REGEX_CHECK_STATUS;
1355     REGEX_ASSERT(dest == ".abx..abx...abx..");
1356
1357     dest = matcher->replaceAll("yz", status);
1358     REGEX_CHECK_STATUS;
1359     REGEX_ASSERT(dest == ".abx..abx...abx..");
1360
1361     //
1362     // Empty source string
1363     //
1364     UnicodeString d3 = "";
1365     matcher->reset(d3);
1366     dest = matcher->replaceFirst("yz", status);
1367     REGEX_CHECK_STATUS;
1368     REGEX_ASSERT(dest == "");
1369
1370     dest = matcher->replaceAll("yz", status);
1371     REGEX_CHECK_STATUS;
1372     REGEX_ASSERT(dest == "");
1373
1374     //
1375     // Empty substitution string
1376     //
1377     matcher->reset(data);              // ".abc..abc...abc.."
1378     dest = matcher->replaceFirst("", status);
1379     REGEX_CHECK_STATUS;
1380     REGEX_ASSERT(dest == "...abc...abc..");
1381
1382     dest = matcher->replaceAll("", status);
1383     REGEX_CHECK_STATUS;
1384     REGEX_ASSERT(dest == "........");
1385
1386     //
1387     // match whole string
1388     //
1389     UnicodeString d4 = "abc";
1390     matcher->reset(d4);
1391     dest = matcher->replaceFirst("xyz", status);
1392     REGEX_CHECK_STATUS;
1393     REGEX_ASSERT(dest == "xyz");
1394
1395     dest = matcher->replaceAll("xyz", status);
1396     REGEX_CHECK_STATUS;
1397     REGEX_ASSERT(dest == "xyz");
1398
1399     //
1400     // Capture Group, simple case
1401     //
1402     UnicodeString       re2("a(..)");
1403     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1404     REGEX_CHECK_STATUS;
1405     UnicodeString d5 = "abcdefg";
1406     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1407     REGEX_CHECK_STATUS;
1408     dest = matcher2->replaceFirst("$1$1", status);
1409     REGEX_CHECK_STATUS;
1410     REGEX_ASSERT(dest == "bcbcdefg");
1411
1412     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1413     REGEX_CHECK_STATUS;
1414     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1415
1416     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1417     REGEX_CHECK_STATUS;
1418     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1419
1420     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1421     replacement = replacement.unescape();
1422     dest = matcher2->replaceFirst(replacement, status);
1423     REGEX_CHECK_STATUS;
1424     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1425
1426     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1427
1428
1429     //
1430     // Replacement String with \u hex escapes
1431     //
1432     {
1433         UnicodeString  src = "abc 1 abc 2 abc 3";
1434         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1435         matcher->reset(src);
1436         UnicodeString  result = matcher->replaceAll(substitute, status);
1437         REGEX_CHECK_STATUS;
1438         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1439     }
1440     {
1441         UnicodeString  src = "abc !";
1442         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1443         matcher->reset(src);
1444         UnicodeString  result = matcher->replaceAll(substitute, status);
1445         REGEX_CHECK_STATUS;
1446         UnicodeString expected = UnicodeString("--");
1447         expected.append((UChar32)0x10000);
1448         expected.append("-- !");
1449         REGEX_ASSERT(result == expected);
1450     }
1451     // TODO:  need more thorough testing of capture substitutions.
1452
1453     // Bug 4057
1454     //
1455     {
1456         status = U_ZERO_ERROR;
1457         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1458         RegexMatcher m("ss(.*?)ee", 0, status);
1459         REGEX_CHECK_STATUS;
1460         UnicodeString result;
1461
1462         // Multiple finds do NOT bump up the previous appendReplacement position.
1463         m.reset(s);
1464         m.find();
1465         m.find();
1466         m.appendReplacement(result, "ooh", status);
1467         REGEX_CHECK_STATUS;
1468         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1469
1470         // After a reset into the interior of a string, appendReplacement still starts at beginning.
1471         status = U_ZERO_ERROR;
1472         result.truncate(0);
1473         m.reset(10, status);
1474         m.find();
1475         m.find();
1476         m.appendReplacement(result, "ooh", status);
1477         REGEX_CHECK_STATUS;
1478         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1479
1480         // find() at interior of string, appendReplacement still starts at beginning.
1481         status = U_ZERO_ERROR;
1482         result.truncate(0);
1483         m.reset();
1484         m.find(10, status);
1485         m.find();
1486         m.appendReplacement(result, "ooh", status);
1487         REGEX_CHECK_STATUS;
1488         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1489
1490         m.appendTail(result);
1491         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1492
1493     }
1494
1495     delete matcher2;
1496     delete pat2;
1497     delete matcher;
1498     delete pat;
1499 }
1500
1501
1502 //---------------------------------------------------------------------------
1503 //
1504 //      API_Pattern       Test that the API for class RegexPattern is
1505 //                        present and nominally working.
1506 //
1507 //---------------------------------------------------------------------------
1508 void RegexTest::API_Pattern() {
1509     RegexPattern        pata;    // Test default constructor to not crash.
1510     RegexPattern        patb;
1511
1512     REGEX_ASSERT(pata == patb);
1513     REGEX_ASSERT(pata == pata);
1514
1515     UnicodeString re1("abc[a-l][m-z]");
1516     UnicodeString re2("def");
1517     UErrorCode    status = U_ZERO_ERROR;
1518     UParseError   pe;
1519
1520     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1521     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1522     REGEX_CHECK_STATUS;
1523     REGEX_ASSERT(*pat1 == *pat1);
1524     REGEX_ASSERT(*pat1 != pata);
1525
1526     // Assign
1527     patb = *pat1;
1528     REGEX_ASSERT(patb == *pat1);
1529
1530     // Copy Construct
1531     RegexPattern patc(*pat1);
1532     REGEX_ASSERT(patc == *pat1);
1533     REGEX_ASSERT(patb == patc);
1534     REGEX_ASSERT(pat1 != pat2);
1535     patb = *pat2;
1536     REGEX_ASSERT(patb != patc);
1537     REGEX_ASSERT(patb == *pat2);
1538
1539     // Compile with no flags.
1540     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1541     REGEX_ASSERT(*pat1a == *pat1);
1542
1543     REGEX_ASSERT(pat1a->flags() == 0);
1544
1545     // Compile with different flags should be not equal
1546     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1547     REGEX_CHECK_STATUS;
1548
1549     REGEX_ASSERT(*pat1b != *pat1a);
1550     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1551     REGEX_ASSERT(pat1a->flags() == 0);
1552     delete pat1b;
1553
1554     // clone
1555     RegexPattern *pat1c = pat1->clone();
1556     REGEX_ASSERT(*pat1c == *pat1);
1557     REGEX_ASSERT(*pat1c != *pat2);
1558
1559     delete pat1c;
1560     delete pat1a;
1561     delete pat1;
1562     delete pat2;
1563
1564
1565     //
1566     //   Verify that a matcher created from a cloned pattern works.
1567     //     (Jitterbug 3423)
1568     //
1569     {
1570         UErrorCode     status     = U_ZERO_ERROR;
1571         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1572         RegexPattern  *pClone     = pSource->clone();
1573         delete         pSource;
1574         RegexMatcher  *mFromClone = pClone->matcher(status);
1575         REGEX_CHECK_STATUS;
1576         UnicodeString s = "Hello World";
1577         mFromClone->reset(s);
1578         REGEX_ASSERT(mFromClone->find() == TRUE);
1579         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1580         REGEX_ASSERT(mFromClone->find() == TRUE);
1581         REGEX_ASSERT(mFromClone->group(status) == "World");
1582         REGEX_ASSERT(mFromClone->find() == FALSE);
1583         delete mFromClone;
1584         delete pClone;
1585     }
1586
1587     //
1588     //   matches convenience API
1589     //
1590     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1591     REGEX_CHECK_STATUS;
1592     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1593     REGEX_CHECK_STATUS;
1594     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1595     REGEX_CHECK_STATUS;
1596     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1597     REGEX_CHECK_STATUS;
1598     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1599     REGEX_CHECK_STATUS;
1600     status = U_INDEX_OUTOFBOUNDS_ERROR;
1601     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1602     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1603
1604
1605     //
1606     // Split()
1607     //
1608     status = U_ZERO_ERROR;
1609     pat1 = RegexPattern::compile(" +",  pe, status);
1610     REGEX_CHECK_STATUS;
1611     UnicodeString  fields[10];
1612
1613     int32_t n;
1614     n = pat1->split("Now is the time", fields, 10, status);
1615     REGEX_CHECK_STATUS;
1616     REGEX_ASSERT(n==4);
1617     REGEX_ASSERT(fields[0]=="Now");
1618     REGEX_ASSERT(fields[1]=="is");
1619     REGEX_ASSERT(fields[2]=="the");
1620     REGEX_ASSERT(fields[3]=="time");
1621     REGEX_ASSERT(fields[4]=="");
1622
1623     n = pat1->split("Now is the time", fields, 2, status);
1624     REGEX_CHECK_STATUS;
1625     REGEX_ASSERT(n==2);
1626     REGEX_ASSERT(fields[0]=="Now");
1627     REGEX_ASSERT(fields[1]=="is the time");
1628     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1629
1630     fields[1] = "*";
1631     status = U_ZERO_ERROR;
1632     n = pat1->split("Now is the time", fields, 1, status);
1633     REGEX_CHECK_STATUS;
1634     REGEX_ASSERT(n==1);
1635     REGEX_ASSERT(fields[0]=="Now is the time");
1636     REGEX_ASSERT(fields[1]=="*");
1637     status = U_ZERO_ERROR;
1638
1639     n = pat1->split("    Now       is the time   ", fields, 10, status);
1640     REGEX_CHECK_STATUS;
1641     REGEX_ASSERT(n==6);
1642     REGEX_ASSERT(fields[0]=="");
1643     REGEX_ASSERT(fields[1]=="Now");
1644     REGEX_ASSERT(fields[2]=="is");
1645     REGEX_ASSERT(fields[3]=="the");
1646     REGEX_ASSERT(fields[4]=="time");
1647     REGEX_ASSERT(fields[5]=="");
1648
1649     n = pat1->split("     ", fields, 10, status);
1650     REGEX_CHECK_STATUS;
1651     REGEX_ASSERT(n==2);
1652     REGEX_ASSERT(fields[0]=="");
1653     REGEX_ASSERT(fields[1]=="");
1654
1655     fields[0] = "foo";
1656     n = pat1->split("", fields, 10, status);
1657     REGEX_CHECK_STATUS;
1658     REGEX_ASSERT(n==0);
1659     REGEX_ASSERT(fields[0]=="foo");
1660
1661     delete pat1;
1662
1663     //  split, with a pattern with (capture)
1664     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1665     REGEX_CHECK_STATUS;
1666
1667     status = U_ZERO_ERROR;
1668     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1669     REGEX_CHECK_STATUS;
1670     REGEX_ASSERT(n==7);
1671     REGEX_ASSERT(fields[0]=="");
1672     REGEX_ASSERT(fields[1]=="a");
1673     REGEX_ASSERT(fields[2]=="Now is ");
1674     REGEX_ASSERT(fields[3]=="b");
1675     REGEX_ASSERT(fields[4]=="the time");
1676     REGEX_ASSERT(fields[5]=="c");
1677     REGEX_ASSERT(fields[6]=="");
1678     REGEX_ASSERT(status==U_ZERO_ERROR);
1679
1680     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1681     REGEX_CHECK_STATUS;
1682     REGEX_ASSERT(n==7);
1683     REGEX_ASSERT(fields[0]=="  ");
1684     REGEX_ASSERT(fields[1]=="a");
1685     REGEX_ASSERT(fields[2]=="Now is ");
1686     REGEX_ASSERT(fields[3]=="b");
1687     REGEX_ASSERT(fields[4]=="the time");
1688     REGEX_ASSERT(fields[5]=="c");
1689     REGEX_ASSERT(fields[6]=="");
1690
1691     status = U_ZERO_ERROR;
1692     fields[6] = "foo";
1693     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1694     REGEX_CHECK_STATUS;
1695     REGEX_ASSERT(n==6);
1696     REGEX_ASSERT(fields[0]=="  ");
1697     REGEX_ASSERT(fields[1]=="a");
1698     REGEX_ASSERT(fields[2]=="Now is ");
1699     REGEX_ASSERT(fields[3]=="b");
1700     REGEX_ASSERT(fields[4]=="the time");
1701     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1702     REGEX_ASSERT(fields[6]=="foo");
1703
1704     status = U_ZERO_ERROR;
1705     fields[5] = "foo";
1706     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1707     REGEX_CHECK_STATUS;
1708     REGEX_ASSERT(n==5);
1709     REGEX_ASSERT(fields[0]=="  ");
1710     REGEX_ASSERT(fields[1]=="a");
1711     REGEX_ASSERT(fields[2]=="Now is ");
1712     REGEX_ASSERT(fields[3]=="b");
1713     REGEX_ASSERT(fields[4]=="the time<c>");
1714     REGEX_ASSERT(fields[5]=="foo");
1715
1716     status = U_ZERO_ERROR;
1717     fields[5] = "foo";
1718     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1719     REGEX_CHECK_STATUS;
1720     REGEX_ASSERT(n==5);
1721     REGEX_ASSERT(fields[0]=="  ");
1722     REGEX_ASSERT(fields[1]=="a");
1723     REGEX_ASSERT(fields[2]=="Now is ");
1724     REGEX_ASSERT(fields[3]=="b");
1725     REGEX_ASSERT(fields[4]=="the time");
1726     REGEX_ASSERT(fields[5]=="foo");
1727
1728     status = U_ZERO_ERROR;
1729     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1730     REGEX_CHECK_STATUS;
1731     REGEX_ASSERT(n==4);
1732     REGEX_ASSERT(fields[0]=="  ");
1733     REGEX_ASSERT(fields[1]=="a");
1734     REGEX_ASSERT(fields[2]=="Now is ");
1735     REGEX_ASSERT(fields[3]=="the time<c>");
1736     status = U_ZERO_ERROR;
1737     delete pat1;
1738
1739     pat1 = RegexPattern::compile("([-,])",  pe, status);
1740     REGEX_CHECK_STATUS;
1741     n = pat1->split("1-10,20", fields, 10, status);
1742     REGEX_CHECK_STATUS;
1743     REGEX_ASSERT(n==5);
1744     REGEX_ASSERT(fields[0]=="1");
1745     REGEX_ASSERT(fields[1]=="-");
1746     REGEX_ASSERT(fields[2]=="10");
1747     REGEX_ASSERT(fields[3]==",");
1748     REGEX_ASSERT(fields[4]=="20");
1749     delete pat1;
1750
1751     // Test split of string with empty trailing fields
1752     pat1 = RegexPattern::compile(",", pe, status);
1753     REGEX_CHECK_STATUS;
1754     n = pat1->split("a,b,c,", fields, 10, status);
1755     REGEX_CHECK_STATUS;
1756     REGEX_ASSERT(n==4);
1757     REGEX_ASSERT(fields[0]=="a");
1758     REGEX_ASSERT(fields[1]=="b");
1759     REGEX_ASSERT(fields[2]=="c");
1760     REGEX_ASSERT(fields[3]=="");
1761
1762     n = pat1->split("a,,,", fields, 10, status);
1763     REGEX_CHECK_STATUS;
1764     REGEX_ASSERT(n==4);
1765     REGEX_ASSERT(fields[0]=="a");
1766     REGEX_ASSERT(fields[1]=="");
1767     REGEX_ASSERT(fields[2]=="");
1768     REGEX_ASSERT(fields[3]=="");
1769     delete pat1;
1770
1771     // Split Separator with zero length match.
1772     pat1 = RegexPattern::compile(":?", pe, status);
1773     REGEX_CHECK_STATUS;
1774     n = pat1->split("abc", fields, 10, status);
1775     REGEX_CHECK_STATUS;
1776     REGEX_ASSERT(n==5);
1777     REGEX_ASSERT(fields[0]=="");
1778     REGEX_ASSERT(fields[1]=="a");
1779     REGEX_ASSERT(fields[2]=="b");
1780     REGEX_ASSERT(fields[3]=="c");
1781     REGEX_ASSERT(fields[4]=="");
1782
1783     delete pat1;
1784
1785     //
1786     // RegexPattern::pattern()
1787     //
1788     pat1 = new RegexPattern();
1789     REGEX_ASSERT(pat1->pattern() == "");
1790     delete pat1;
1791
1792     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1793     REGEX_CHECK_STATUS;
1794     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1795     delete pat1;
1796
1797
1798     //
1799     // classID functions
1800     //
1801     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1802     REGEX_CHECK_STATUS;
1803     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1804     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1805     UnicodeString Hello("Hello, world.");
1806     RegexMatcher *m = pat1->matcher(Hello, status);
1807     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1808     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1809     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1810     delete m;
1811     delete pat1;
1812
1813 }
1814
1815 //---------------------------------------------------------------------------
1816 //
1817 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1818 //                       is present and working, but excluding functions
1819 //                       implementing replace operations.
1820 //
1821 //---------------------------------------------------------------------------
1822 void RegexTest::API_Match_UTF8() {
         // Exercises the RegexMatcher matching API through UText input (mostly
         // opened with utext_openUTF8): matcher creation and reset, reset(pos),
         // matches(pos), lookingAt(pos), capture-group start()/end()/group(),
         // find(), \G handling, progress after zero-length matches, matchers
         // with no input, regions and bounds flags, and hitEnd()/requireEnd().
         //
         // The pe/status/flags declared here are used by the first section;
         // the later sections declare their own status (and pe/flags where
         // needed) inside their own braces, shadowing these.
1823     UParseError         pe;
1824     UErrorCode          status=U_ZERO_ERROR;
1825     int32_t             flags = 0;
1826
1827     //
1828     // Debug - slide failing test cases early
1829     //
         // (Disabled scratch area: move a failing case into the braces and
         //  change "#if 0" to run only that case.)
1830 #if 0
1831     {
1832     }
1833     return;
1834 #endif
1835
1836     //
1837     // Simple pattern compilation
1838     //
1839     {
             // re holds the pattern "abc"; input1 and input2 are the subject
             // texts; empty is a zero-length UText opened over UChars.
1840         UText               re = UTEXT_INITIALIZER;
1841         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1842         REGEX_VERBOSE_TEXT(&re);
1843         RegexPattern        *pat2;
1844         pat2 = RegexPattern::compile(&re, flags, pe, status);
1845         REGEX_CHECK_STATUS;
1846
1847         UText input1 = UTEXT_INITIALIZER;
1848         UText input2 = UTEXT_INITIALIZER;
1849         UText empty  = UTEXT_INITIALIZER;
1850         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1851         REGEX_VERBOSE_TEXT(&input1);
1852         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1853         REGEX_VERBOSE_TEXT(&input2);
1854         utext_openUChars(&empty, NULL, 0, &status);
1855
             // Lengths of the two literals above; used below as the position
             // arguments passed to reset(), matches() and lookingAt().
1856         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1857         int32_t input2Len = strlen("not abc");
1858
1859
1860         //
1861         // Matcher creation and reset.
1862         //
             // m1 is the address of the matcher that reset() returns by
             // reference; it is deleted at the end of this section.
1863         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1864         REGEX_CHECK_STATUS;
1865         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
             // Expected text is spelled out as explicit byte values.
             // NOTE(review): presumably so the expected UTF-8 does not depend on
             // the compiler's character set (see the EBCDIC note at the top of
             // the file) - confirm.
1866         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1867         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1868         m1->reset(&input2);
1869         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1870         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1871         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1872         m1->reset(&input1);
1873         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1874         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1875         m1->reset(&empty);
1876         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1877         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1878
1879         //
1880         //  reset(pos, status)
1881         //
             // Positions 0, input1Len-1 and input1Len are expected to succeed;
             // -1 and input1Len+1 are expected to set U_INDEX_OUTOFBOUNDS_ERROR.
             // status is cleared after each case so the next one starts clean.
1882         m1->reset(&input1);
1883         m1->reset(4, status);
1884         REGEX_CHECK_STATUS;
1885         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1886         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1887
1888         m1->reset(-1, status);
1889         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1890         status = U_ZERO_ERROR;
1891
1892         m1->reset(0, status);
1893         REGEX_CHECK_STATUS;
1894         status = U_ZERO_ERROR;
1895
1896         m1->reset(input1Len-1, status);
1897         REGEX_CHECK_STATUS;
1898         status = U_ZERO_ERROR;
1899
1900         m1->reset(input1Len, status);
1901         REGEX_CHECK_STATUS;
1902         status = U_ZERO_ERROR;
1903
1904         m1->reset(input1Len+1, status);
1905         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1906         status = U_ZERO_ERROR;
1907
1908         //
1909         // matches(pos, status)
1910         //
             // input2 is "not abc", so the pattern "abc" begins at offset 4.
1911         m1->reset(&input2);
1912         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913         m1->reset();
1914         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1915         m1->reset();
1916         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1917         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1918         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1919         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1920
1921         // Match() at end of string should fail, but should not
1922         //  be an error.
1923         status = U_ZERO_ERROR;
1924         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1925         REGEX_CHECK_STATUS;
1926
1927         // Match beyond end of string should fail with an error.
1928         status = U_ZERO_ERROR;
1929         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1930         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931
1932         // Successful match at end of string.
1933         {
1934             status = U_ZERO_ERROR;
1935             RegexMatcher m("A?", 0, status);  // will match zero length string.
1936             REGEX_CHECK_STATUS;
1937             m.reset(&input1);
1938             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1939             REGEX_CHECK_STATUS;
1940             m.reset(&empty);
1941             REGEX_ASSERT(m.matches(0, status) == TRUE);
1942             REGEX_CHECK_STATUS;
1943         }
1944
1945
1946         //
1947         // lookingAt(pos, status)
1948         //
             // Same positions and expectations as the matches(pos, status)
             // cases above.  status is not cleared between the input2Len and
             // input2Len+1 calls; the first of those is expected to leave it
             // without an error.
1949         status = U_ZERO_ERROR;
1950         m1->reset(&input2);  // "not abc"
1951         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1952         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1953         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1954         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1955         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1956         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957         status = U_ZERO_ERROR;
1958         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1959         REGEX_CHECK_STATUS;
1960         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1961         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1962
             // Release the matcher and pattern, then close every UText opened
             // in this section.
1963         delete m1;
1964         delete pat2;
1965
1966         utext_close(&re);
1967         utext_close(&input1);
1968         utext_close(&input2);
1969         utext_close(&empty);
1970     }
1971
1972
1973     //
1974     // Capture Group.
1975     //     RegexMatcher::start();
1976     //     RegexMatcher::end();
1977     //     RegexMatcher::groupCount();
1978     //
1979     {
             // Section-local flags/pe/status shadow the function-level ones.
1980         int32_t             flags=0;
1981         UParseError         pe;
1982         UErrorCode          status=U_ZERO_ERROR;
1983         UText               re=UTEXT_INITIALIZER;
1984         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1985         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1986
1987         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1988         REGEX_CHECK_STATUS;
1989
1990         UText input = UTEXT_INITIALIZER;
1991         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1992         utext_openUTF8(&input, str_0123456789, -1, &status);
1993
1994         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1995         REGEX_CHECK_STATUS;
1996         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
             // Expected start/end offsets, indexed by group number
             // (index 0 is the whole match, 1..3 are the capture groups).
1997         static const int32_t matchStarts[] = {0,  2, 4, 8};
1998         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1999         int32_t i;
2000         for (i=0; i<4; i++) {
2001             int32_t actualStart = matcher->start(i, status);
2002             REGEX_CHECK_STATUS;
2003             if (actualStart != matchStarts[i]) {
2004                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2005                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2006             }
2007             int32_t actualEnd = matcher->end(i, status);
2008             REGEX_CHECK_STATUS;
2009             if (actualEnd != matchEnds[i]) {
2010                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2011                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2012             }
2013         }
2014
             // The no-group overloads must agree with group 0.
2015         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2016         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2017
             // Group numbers outside 0..3 are index errors; after reset() there
             // is no current match, so start() is an invalid-state error.
2018         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2019         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020         matcher->reset();
2021         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2022
             // Re-establish a match for the group() checks below.
2023         matcher->lookingAt(status);
2024
             // group() is called in pairs: once with a NULL destination, whose
             // returned UText this test closes, and once with &destText, where
             // the same pointer is expected back.
2025         UnicodeString dest;
2026         UText destText = UTEXT_INITIALIZER;
2027         utext_openUnicodeString(&destText, &dest, &status);
2028         UText *result;
2029         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2030         //      Test shallow-clone API
2031         int64_t   group_len;
2032         result = matcher->group((UText *)NULL, group_len, status);
2033         REGEX_CHECK_STATUS;
2034         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035         utext_close(result);
2036         result = matcher->group(0, &destText, group_len, status);
2037         REGEX_CHECK_STATUS;
2038         REGEX_ASSERT(result == &destText);
2039         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2040         //  destText is now immutable, reopen it
2041         utext_close(&destText);
2042         utext_openUnicodeString(&destText, &dest, &status);
2043
             // Group 0: the whole match.
2044         result = matcher->group(0, NULL, status);
2045         REGEX_CHECK_STATUS;
2046         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047         utext_close(result);
2048         result = matcher->group(0, &destText, status);
2049         REGEX_CHECK_STATUS;
2050         REGEX_ASSERT(result == &destText);
2051         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2052
             // Group 1: (23(45)67)
2053         result = matcher->group(1, NULL, status);
2054         REGEX_CHECK_STATUS;
2055         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2056         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057         utext_close(result);
2058         result = matcher->group(1, &destText, status);
2059         REGEX_CHECK_STATUS;
2060         REGEX_ASSERT(result == &destText);
2061         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2062
             // Group 2: (45)
2063         result = matcher->group(2, NULL, status);
2064         REGEX_CHECK_STATUS;
2065         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2066         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067         utext_close(result);
2068         result = matcher->group(2, &destText, status);
2069         REGEX_CHECK_STATUS;
2070         REGEX_ASSERT(result == &destText);
2071         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2072
             // Group 3: (.*)
2073         result = matcher->group(3, NULL, status);
2074         REGEX_CHECK_STATUS;
2075         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2076         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077         utext_close(result);
2078         result = matcher->group(3, &destText, status);
2079         REGEX_CHECK_STATUS;
2080         REGEX_ASSERT(result == &destText);
2081         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2082
             // Same error expectations as for start() above, now for group().
2083         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2084         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2085         matcher->reset();
2086         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2087
2088         delete matcher;
2089         delete pat;
2090
2091         utext_close(&destText);
2092         utext_close(&input);
2093         utext_close(&re);
2094     }
2095
2096     //
2097     //  find
2098     //
2099     {
2100         int32_t             flags=0;
2101         UParseError         pe;
2102         UErrorCode          status=U_ZERO_ERROR;
2103         UText               re=UTEXT_INITIALIZER;
2104         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2105         utext_openUTF8(&re, str_abc, -1, &status);
2106
2107         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2108         REGEX_CHECK_STATUS;
2109         UText input = UTEXT_INITIALIZER;
2110         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2111         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2112         //                      012345678901234567
2113
             // Successive find() calls are expected to report "abc" at offsets
             // 1, 6 and 12, and then to keep returning FALSE.
2114         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2115         REGEX_CHECK_STATUS;
2116         REGEX_ASSERT(matcher->find());
2117         REGEX_ASSERT(matcher->start(status) == 1);
2118         REGEX_ASSERT(matcher->find());
2119         REGEX_ASSERT(matcher->start(status) == 6);
2120         REGEX_ASSERT(matcher->find());
2121         REGEX_ASSERT(matcher->start(status) == 12);
2122         REGEX_ASSERT(matcher->find() == FALSE);
2123         REGEX_ASSERT(matcher->find() == FALSE);
2124
2125         matcher->reset();
2126         REGEX_ASSERT(matcher->find());
2127         REGEX_ASSERT(matcher->start(status) == 1);
2128
             // find(pos, status): the search starts at the given offset.
2129         REGEX_ASSERT(matcher->find(0, status));
2130         REGEX_ASSERT(matcher->start(status) == 1);
2131         REGEX_ASSERT(matcher->find(1, status));
2132         REGEX_ASSERT(matcher->start(status) == 1);
2133         REGEX_ASSERT(matcher->find(2, status));
2134         REGEX_ASSERT(matcher->start(status) == 6);
2135         REGEX_ASSERT(matcher->find(12, status));
2136         REGEX_ASSERT(matcher->start(status) == 12);
2137         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2138         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2139         REGEX_ASSERT(matcher->find(17, status) == FALSE);
             // After a failed find there is no current match to query.
2140         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2141
             // Offsets -1 and 18 (one past the 17-byte input length) are
             // expected to be index errors.
2142         status = U_ZERO_ERROR;
2143         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2144         status = U_ZERO_ERROR;
2145         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2146
             // The pattern "abc" has no capture groups.
2147         REGEX_ASSERT(matcher->groupCount() == 0);
2148
2149         delete matcher;
2150         delete pat;
2151
2152         utext_close(&input);
2153         utext_close(&re);
2154     }
2155
2156
2157     //
2158     //  find, with \G in pattern (true if at the end of a previous match).
2159     //
2160     {
2161         int32_t             flags=0;
2162         UParseError         pe;
2163         UErrorCode          status=U_ZERO_ERROR;
2164         UText               re=UTEXT_INITIALIZER;
2165         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2166         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2167
2168         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2169
2170         REGEX_CHECK_STATUS;
2171         UText input = UTEXT_INITIALIZER;
2172         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2173         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2174         //                      012345678901234567
2175
2176         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2177         REGEX_CHECK_STATUS;
             // First find: group 1 (the \G alternative) is expected to be unset
             // (-1) and group 2 (plain "abc") to start at offset 1.
2178         REGEX_ASSERT(matcher->find());
2179         REGEX_ASSERT(matcher->start(status) == 0);
2180         REGEX_ASSERT(matcher->start(1, status) == -1);
2181         REGEX_ASSERT(matcher->start(2, status) == 1);
2182
             // Second find: the match starts at offset 4 and this time it is
             // group 1 (the \G alternative) that participates.
2183         REGEX_ASSERT(matcher->find());
2184         REGEX_ASSERT(matcher->start(status) == 4);
2185         REGEX_ASSERT(matcher->start(1, status) == 4);
2186         REGEX_ASSERT(matcher->start(2, status) == -1);
2187         REGEX_CHECK_STATUS;
2188
2189         delete matcher;
2190         delete pat;
2191
2192         utext_close(&input);
2193         utext_close(&re);
2194     }
2195
2196     //
2197     //   find with zero length matches, match position should bump ahead
2198     //     to prevent loops.
2199     //
2200     {
2201         int32_t                 i;
2202         UErrorCode          status=U_ZERO_ERROR;
2203         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2204                                                       //   using an always-true look-ahead.
2205         REGEX_CHECK_STATUS;
2206         UText s = UTEXT_INITIALIZER;
2207         utext_openUTF8(&s, "    ", -1, &status);
2208         m.reset(&s);
             // Four spaces: empty matches are expected at offsets 0..4, so the
             // loop body runs five times and leaves i == 5.
2209         for (i=0; ; i++) {
2210             if (m.find() == FALSE) {
2211                 break;
2212             }
2213             REGEX_ASSERT(m.start(status) == i);
2214             REGEX_ASSERT(m.end(status) == i);
2215         }
2216         REGEX_ASSERT(i==5);
2217
2218         // Check that the bump goes over characters outside the BMP OK
2219         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
             // Four 4-byte sequences: the expected match offsets advance by 4
             // (0, 4, 8, 12, 16), so i ends at 20.
             // s is reopened on the new buffer without an intervening
             // utext_close().
             // NOTE(review): relies on utext_openUTF8 accepting an already-open
             // UText - confirm against the UText API documentation.
2220         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2221         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2222         m.reset(&s);
2223         for (i=0; ; i+=4) {
2224             if (m.find() == FALSE) {
2225                 break;
2226             }
2227             REGEX_ASSERT(m.start(status) == i);
2228             REGEX_ASSERT(m.end(status) == i);
2229         }
2230         REGEX_ASSERT(i==20);
2231
2232         utext_close(&s);
2233     }
2234     {
2235         // find() loop breaking test.
2236         //        with pattern of /.?/, should see a series of one char matches, then a single
2237         //        match of zero length at the end of the input string.
2238         int32_t                 i;
2239         UErrorCode          status=U_ZERO_ERROR;
2240         RegexMatcher        m(".?", 0, status);
2241         REGEX_CHECK_STATUS;
2242         UText s = UTEXT_INITIALIZER;
2243         utext_openUTF8(&s, "    ", -1, &status);
2244         m.reset(&s);
2245         for (i=0; ; i++) {
2246             if (m.find() == FALSE) {
2247                 break;
2248             }
2249             REGEX_ASSERT(m.start(status) == i);
                 // Offsets 0..3 match one character; offset 4 (end of input)
                 // is the single zero-length match.
2250             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2251         }
2252         REGEX_ASSERT(i==5);
2253
2254         utext_close(&s);
2255     }
2256
2257
2258     //
2259     // Matchers with no input string behave as if they had an empty input string.
2260     //
2261
2262     {
             // ".?" can match the empty string, so find() succeeds at offset 0.
2263         UErrorCode status = U_ZERO_ERROR;
2264         RegexMatcher  m(".?", 0, status);
2265         REGEX_CHECK_STATUS;
2266         REGEX_ASSERT(m.find());
2267         REGEX_ASSERT(m.start(status) == 0);
2268         REGEX_ASSERT(m.input() == "");
2269     }
2270     {
             // "." needs one character, so find() fails on the empty input.
2271         UErrorCode status = U_ZERO_ERROR;
2272         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2273         RegexMatcher  *m = p->matcher(status);
2274         REGEX_CHECK_STATUS;
2275
2276         REGEX_ASSERT(m->find() == FALSE);
2277         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2278         delete m;
2279         delete p;
2280     }
2281
2282     //
2283     // Regions
2284     //
2285     {
2286         UErrorCode status = U_ZERO_ERROR;
2287         UText testPattern = UTEXT_INITIALIZER;
2288         UText testText    = UTEXT_INITIALIZER;
2289         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2290         REGEX_VERBOSE_TEXT(&testPattern);
2291         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2292         REGEX_VERBOSE_TEXT(&testText);
2293
             // A new matcher's region is expected to span the whole input, with
             // transparent bounds off and anchoring bounds on.
2294         RegexMatcher m(&testPattern, &testText, 0, status);
2295         REGEX_CHECK_STATUS;
2296         REGEX_ASSERT(m.regionStart() == 0);
2297         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2298         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2299         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2300
             // With the region set to [2,4), ".*" is expected to match exactly
             // that range.
2301         m.region(2,4, status);
2302         REGEX_CHECK_STATUS;
2303         REGEX_ASSERT(m.matches(status));
2304         REGEX_ASSERT(m.start(status)==2);
2305         REGEX_ASSERT(m.end(status)==4);
2306         REGEX_CHECK_STATUS;
2307
             // reset() is expected to restore the region to the whole input.
2308         m.reset();
2309         REGEX_ASSERT(m.regionStart() == 0);
2310         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2311
             // Switching to a new input is expected to size the region to it.
2312         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2313         REGEX_VERBOSE_TEXT(&testText);
2314         m.reset(&testText);
2315         REGEX_ASSERT(m.regionStart() == 0);
2316         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2317
             // The use*Bounds() setters and reset() return the matcher itself,
             // and each flag is expected to keep its value across reset().
2318         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2319         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2320         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2321         REGEX_ASSERT(&m == &m.reset());
2322         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2323
2324         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2325         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326         REGEX_ASSERT(&m == &m.reset());
2327         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2328
2329         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2330         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2331         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2332         REGEX_ASSERT(&m == &m.reset());
2333         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2334
2335         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2336         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2337         REGEX_ASSERT(&m == &m.reset());
2338         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2339
2340         utext_close(&testText);
2341         utext_close(&testPattern);
2342     }
2343
2344     //
2345     // hitEnd() and requireEnd()
2346     //
2347     {
             // Three patterns are run against the same text "aabb"; testPattern
             // is reopened on each new pattern before the next matcher is built.
2348         UErrorCode status = U_ZERO_ERROR;
2349         UText testPattern = UTEXT_INITIALIZER;
2350         UText testText    = UTEXT_INITIALIZER;
2351         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2352         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2353         utext_openUTF8(&testPattern, str_, -1, &status);
2354         utext_openUTF8(&testText, str_aabb, -1, &status);
2355
             // ".*" : expected hitEnd TRUE, requireEnd FALSE.
2356         RegexMatcher m1(&testPattern, &testText,  0, status);
2357         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2358         REGEX_ASSERT(m1.hitEnd() == TRUE);
2359         REGEX_ASSERT(m1.requireEnd() == FALSE);
2360         REGEX_CHECK_STATUS;
2361
             // "a*" : expected hitEnd FALSE, requireEnd FALSE.
2362         status = U_ZERO_ERROR;
2363         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2364         utext_openUTF8(&testPattern, str_a, -1, &status);
2365         RegexMatcher m2(&testPattern, &testText, 0, status);
2366         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2367         REGEX_ASSERT(m2.hitEnd() == FALSE);
2368         REGEX_ASSERT(m2.requireEnd() == FALSE);
2369         REGEX_CHECK_STATUS;
2370
             // ".*$" : expected hitEnd TRUE, requireEnd TRUE.
2371         status = U_ZERO_ERROR;
2372         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2373         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2374         RegexMatcher m3(&testPattern, &testText, 0, status);
2375         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2376         REGEX_ASSERT(m3.hitEnd() == TRUE);
2377         REGEX_ASSERT(m3.requireEnd() == TRUE);
2378         REGEX_CHECK_STATUS;
2379
2380         utext_close(&testText);
2381         utext_close(&testPattern);
2382     }
2383 }
2384
2385
2386 //---------------------------------------------------------------------------
2387 //
2388 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2389 //                         Replace family of functions with UText arguments
     //                         opened over UTF-8 data.
     //
     //      Most scenarios call replaceFirst() / replaceAll() twice: once with a
     //      NULL destination (the returned UText is closed by the test) and once
     //      with the caller-supplied destText (the call is asserted to return
     //      &destText).  The final "Bug 4057" section drives appendReplacement()
     //      and appendTail() directly.
     //
     //      Test strings are spelled out as numeric byte values with the intended
     //      text in a trailing comment; presumably this keeps the data independent
     //      of the compiler's character set (see the ASCII/EBCDIC note at the top
     //      of this file) -- TODO confirm.
2390 //
2391 //---------------------------------------------------------------------------
2392 void RegexTest::API_Replace_UTF8() {
2393     //
2394     //  Replace
2395     //
2396     int32_t             flags=0;
2397     UParseError         pe;
2398     UErrorCode          status=U_ZERO_ERROR;
2399
         // Pattern "abc", used by every scenario up to the capture-group tests.
2400     UText               re=UTEXT_INITIALIZER;
2401     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2402     REGEX_VERBOSE_TEXT(&re);
2403     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2404     REGEX_CHECK_STATUS;
2405
2406     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2407     //             012345678901234567
2408     UText dataText = UTEXT_INITIALIZER;
2409     utext_openUTF8(&dataText, data, -1, &status);
2410     REGEX_CHECK_STATUS;
2411     REGEX_VERBOSE_TEXT(&dataText);
         // The address of what reset() returns is kept as the matcher pointer;
         // that matcher is deleted at the end of this function.
2412     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2413
2414     //
2415     //  Plain vanilla matches.
2416     //
         // destText is opened over the UnicodeString dest and serves as the
         // caller-supplied destination in the second half of each scenario.
2417     UnicodeString  dest;
2418     UText destText = UTEXT_INITIALIZER;
2419     utext_openUnicodeString(&destText, &dest, &status);
2420     UText *result;
2421
2422     UText replText = UTEXT_INITIALIZER;
2423
2424     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2425     utext_openUTF8(&replText, str_yz, -1, &status);
2426     REGEX_VERBOSE_TEXT(&replText);
2427     result = matcher->replaceFirst(&replText, NULL, status);
2428     REGEX_CHECK_STATUS;
2429     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2430     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431     utext_close(result);
2432     result = matcher->replaceFirst(&replText, &destText, status);
2433     REGEX_CHECK_STATUS;
2434     REGEX_ASSERT(result == &destText);
2435     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2436
2437     result = matcher->replaceAll(&replText, NULL, status);
2438     REGEX_CHECK_STATUS;
2439     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2440     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2441     utext_close(result);
2442
         // Empty destText (replace its whole native range with nothing) before
         // handing it to replaceAll() as the destination.
2443     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2444     result = matcher->replaceAll(&replText, &destText, status);
2445     REGEX_CHECK_STATUS;
2446     REGEX_ASSERT(result == &destText);
2447     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2448
2449     //
2450     //  Plain vanilla non-matches.
2451     //
         // The input contains no "abc", so every result is expected to equal the input.
2452     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2453     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2454     matcher->reset(&dataText);
2455
2456     result = matcher->replaceFirst(&replText, NULL, status);
2457     REGEX_CHECK_STATUS;
2458     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459     utext_close(result);
2460     result = matcher->replaceFirst(&replText, &destText, status);
2461     REGEX_CHECK_STATUS;
2462     REGEX_ASSERT(result == &destText);
2463     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2464
2465     result = matcher->replaceAll(&replText, NULL, status);
2466     REGEX_CHECK_STATUS;
2467     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2468     utext_close(result);
2469     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470     result = matcher->replaceAll(&replText, &destText, status);
2471     REGEX_CHECK_STATUS;
2472     REGEX_ASSERT(result == &destText);
2473     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2474
2475     //
2476     // Empty source string
2477     //
         // dataText is re-opened over a NULL pointer with length 0; all results
         // are expected to be empty.
2478     utext_openUTF8(&dataText, NULL, 0, &status);
2479     matcher->reset(&dataText);
2480
2481     result = matcher->replaceFirst(&replText, NULL, status);
2482     REGEX_CHECK_STATUS;
2483     REGEX_ASSERT_UTEXT_UTF8("", result);
2484     utext_close(result);
2485     result = matcher->replaceFirst(&replText, &destText, status);
2486     REGEX_CHECK_STATUS;
2487     REGEX_ASSERT(result == &destText);
2488     REGEX_ASSERT_UTEXT_UTF8("", result);
2489
2490     result = matcher->replaceAll(&replText, NULL, status);
2491     REGEX_CHECK_STATUS;
2492     REGEX_ASSERT_UTEXT_UTF8("", result);
2493     utext_close(result);
2494     result = matcher->replaceAll(&replText, &destText, status);
2495     REGEX_CHECK_STATUS;
2496     REGEX_ASSERT(result == &destText);
2497     REGEX_ASSERT_UTEXT_UTF8("", result);
2498
2499     //
2500     // Empty substitution string
2501     //
         // With an empty replacement each matched "abc" is simply removed.
2502     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2503     matcher->reset(&dataText);
2504
2505     utext_openUTF8(&replText, NULL, 0, &status);
2506     result = matcher->replaceFirst(&replText, NULL, status);
2507     REGEX_CHECK_STATUS;
2508     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2509     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510     utext_close(result);
2511     result = matcher->replaceFirst(&replText, &destText, status);
2512     REGEX_CHECK_STATUS;
2513     REGEX_ASSERT(result == &destText);
2514     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2515
2516     result = matcher->replaceAll(&replText, NULL, status);
2517     REGEX_CHECK_STATUS;
2518     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2519     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2520     utext_close(result);
2521     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522     result = matcher->replaceAll(&replText, &destText, status);
2523     REGEX_CHECK_STATUS;
2524     REGEX_ASSERT(result == &destText);
2525     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2526
2527     //
2528     // match whole string
2529     //
         // Input "abc" is matched in full, so the result is exactly the replacement "xyz".
2530     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2531     utext_openUTF8(&dataText, str_abc, -1, &status);
2532     matcher->reset(&dataText);
2533
2534     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2535     utext_openUTF8(&replText, str_xyz, -1, &status);
2536     result = matcher->replaceFirst(&replText, NULL, status);
2537     REGEX_CHECK_STATUS;
2538     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539     utext_close(result);
2540     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2541     result = matcher->replaceFirst(&replText, &destText, status);
2542     REGEX_CHECK_STATUS;
2543     REGEX_ASSERT(result == &destText);
2544     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2545
2546     result = matcher->replaceAll(&replText, NULL, status);
2547     REGEX_CHECK_STATUS;
2548     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2549     utext_close(result);
2550     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2551     result = matcher->replaceAll(&replText, &destText, status);
2552     REGEX_CHECK_STATUS;
2553     REGEX_ASSERT(result == &destText);
2554     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2555
2556     //
2557     // Capture Group, simple case
2558     //
         // A second pattern/matcher pair ("a(..)" over "abcdefg") is used for the
         // capture-group scenarios; group 1 is "bc".
2559     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2560     utext_openUTF8(&re, str_add, -1, &status);
2561     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2562     REGEX_CHECK_STATUS;
2563
2564     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2565     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2566     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2567     REGEX_CHECK_STATUS;
2568
2569     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2570     utext_openUTF8(&replText, str_11, -1, &status);
2571     result = matcher2->replaceFirst(&replText, NULL, status);
2572     REGEX_CHECK_STATUS;
2573     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2574     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2575     utext_close(result);
2576     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577     result = matcher2->replaceFirst(&replText, &destText, status);
2578     REGEX_CHECK_STATUS;
2579     REGEX_ASSERT(result == &destText);
2580     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2581
         // Backslash-escaped "\$1" is expected to come through as the literal
         // text "$1", while the unescaped "$1" is replaced by group 1.
2582     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2583     utext_openUTF8(&replText, str_v, -1, &status);
2584     REGEX_VERBOSE_TEXT(&replText);
2585     result = matcher2->replaceFirst(&replText, NULL, status);
2586     REGEX_CHECK_STATUS;
2587     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2588     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2589     utext_close(result);
2590     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2591     result = matcher2->replaceFirst(&replText, &destText, status);
2592     REGEX_CHECK_STATUS;
2593     REGEX_ASSERT(result == &destText);
2594     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2595
         // '$' characters that are not followed by a group number are expected
         // to be copied to the output unchanged.
2596     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2597     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2598     result = matcher2->replaceFirst(&replText, NULL, status);
2599     REGEX_CHECK_STATUS;
2600     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2601     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2602     utext_close(result);
2603     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2604     result = matcher2->replaceFirst(&replText, &destText, status);
2605     REGEX_CHECK_STATUS;
2606     REGEX_ASSERT(result == &destText);
2607     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2608
         // The "xxxx" placeholder (bytes 22..25) is overwritten below with the
         // four-byte UTF-8 sequence F0 9D 9F 8F, so the replacement contains '$'
         // followed by a supplementary-plane digit one.  The expected output
         // shows that digit being treated as group number 1.
2609     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2610     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2611     //                                 012345678901234567890123456
2612     supplDigitChars[22] = 0xF0;
2613     supplDigitChars[23] = 0x9D;
2614     supplDigitChars[24] = 0x9F;
2615     supplDigitChars[25] = 0x8F;
2616     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2617
2618     result = matcher2->replaceFirst(&replText, NULL, status);
2619     REGEX_CHECK_STATUS;
2620     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2621     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2622     utext_close(result);
2623     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2624     result = matcher2->replaceFirst(&replText, &destText, status);
2625     REGEX_CHECK_STATUS;
2626     REGEX_ASSERT(result == &destText);
2627     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
         // The pattern has only one capture group, so "$5" is expected to be
         // rejected with U_INDEX_OUTOFBOUNDS_ERROR.
2628     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2629     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2630     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2631 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632     utext_close(result);
2633     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2634     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2635     REGEX_ASSERT(result == &destText);
2636 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2637
2638     //
2639     // Replacement String with \u hex escapes
2640     //
         // These two scenarios go back to the first matcher (pattern "abc").
2641     {
2642       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2643       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2644         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2645         utext_openUTF8(&replText, str_u0043, -1, &status);
2646         matcher->reset(&dataText);
2647
2648         result = matcher->replaceAll(&replText, NULL, status);
2649         REGEX_CHECK_STATUS;
2650         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2651         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2652         utext_close(result);
2653         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2654         result = matcher->replaceAll(&replText, &destText, status);
2655         REGEX_CHECK_STATUS;
2656         REGEX_ASSERT(result == &destText);
2657         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2658     }
2659     {
             // NOTE: this str_abc ("abc !") shadows the function-level str_abc ("abc").
2660       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2661         utext_openUTF8(&dataText, str_abc, -1, &status);
2662         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2663         utext_openUTF8(&replText, str_U00010000, -1, &status);
2664         matcher->reset(&dataText);
2665
             // Bytes 2..5 of the expected text are patched to F0 90 80 80, the
             // UTF-8 form of the character produced by the \U00010000 escape.
2666         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2667         //                          0123456789
2668         expected[2] = 0xF0;
2669         expected[3] = 0x90;
2670         expected[4] = 0x80;
2671         expected[5] = 0x80;
2672
2673         result = matcher->replaceAll(&replText, NULL, status);
2674         REGEX_CHECK_STATUS;
2675         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2676         utext_close(result);
2677         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678         result = matcher->replaceAll(&replText, &destText, status);
2679         REGEX_CHECK_STATUS;
2680         REGEX_ASSERT(result == &destText);
2681         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2682     }
2683     // TODO:  need more thorough testing of capture substitutions.
2684
2685     // Bug 4057
2686     //
         // Checks that appendReplacement() copies input text starting from the
         // beginning of the input (or from the previous append position), no
         // matter how many find() calls or resets happened in between.
2687     {
2688         status = U_ZERO_ERROR;
2689 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2690 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2691 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2692         utext_openUTF8(&re, str_ssee, -1, &status);
2693         utext_openUTF8(&dataText, str_blah, -1, &status);
2694         utext_openUTF8(&replText, str_ooh, -1, &status);
2695
2696         RegexMatcher m(&re, 0, status);
2697         REGEX_CHECK_STATUS;
2698
             // NOTE: this UnicodeString shadows the function-level "UText *result".
2699         UnicodeString result;
2700         UText resultText = UTEXT_INITIALIZER;
2701         utext_openUnicodeString(&resultText, &result, &status);
2702
2703         // Multiple finds do NOT bump up the previous appendReplacement position.
2704         m.reset(&dataText);
2705         m.find();
2706         m.find();
2707         m.appendReplacement(&resultText, &replText, status);
2708         REGEX_CHECK_STATUS;
2709         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2710         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2711
2712         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2713         status = U_ZERO_ERROR;
2714         result.truncate(0);
2715         utext_openUnicodeString(&resultText, &result, &status);
2716         m.reset(10, status);
2717         m.find();
2718         m.find();
2719         m.appendReplacement(&resultText, &replText, status);
2720         REGEX_CHECK_STATUS;
2721         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2722         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2723
2724         // find() at interior of string, appendReplacement still starts at beginning.
2725         status = U_ZERO_ERROR;
2726         result.truncate(0);
2727         utext_openUnicodeString(&resultText, &result, &status);
2728         m.reset();
2729         m.find(10, status);
2730         m.find();
2731         m.appendReplacement(&resultText, &replText, status);
2732         REGEX_CHECK_STATUS;
2733         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2734         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2735
             // appendTail() adds the text after the last match (" fin").
2736         m.appendTail(&resultText, status);
2737         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2738         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2739
2740         utext_close(&resultText);
2741     }
2742
         // Release the heap-allocated matchers/patterns, then the stack UTexts.
2743     delete matcher2;
2744     delete pat2;
2745     delete matcher;
2746     delete pat;
2747
2748     utext_close(&dataText);
2749     utext_close(&replText);
2750     utext_close(&destText);
2751     utext_close(&re);
2752 }
2753
2754
2755 //---------------------------------------------------------------------------
2756 //
2757 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2758 //                        present and nominally working.
     //
     //      Patterns are supplied as UText opened over UTF-8 byte arrays.
     //      Sections, in order:
     //        - equality, assignment, copy construction, flags() and clone()
     //        - a matcher created from a cloned pattern (Jitterbug 3423)
     //        - the static RegexPattern::matches() convenience function
     //        - split(), with and without capture groups in the pattern
     //        - pattern() and patternText()
2759 //
2760 //---------------------------------------------------------------------------
2761 void RegexTest::API_Pattern_UTF8() {
2762     RegexPattern        pata;    // Test default constructor to not crash.
2763     RegexPattern        patb;
2764
2765     REGEX_ASSERT(pata == patb);
2766     REGEX_ASSERT(pata == pata);
2767
2768     UText         re1 = UTEXT_INITIALIZER;
2769     UText         re2 = UTEXT_INITIALIZER;
2770     UErrorCode    status = U_ZERO_ERROR;
2771     UParseError   pe;
2772
2773     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2774     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2775     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2776     utext_openUTF8(&re2, str_def, -1, &status);
2777
2778     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2779     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2780     REGEX_CHECK_STATUS;
2781     REGEX_ASSERT(*pat1 == *pat1);
2782     REGEX_ASSERT(*pat1 != pata);
2783
2784     // Assign
2785     patb = *pat1;
2786     REGEX_ASSERT(patb == *pat1);
2787
2788     // Copy Construct
2789     RegexPattern patc(*pat1);
2790     REGEX_ASSERT(patc == *pat1);
2791     REGEX_ASSERT(patb == patc);
         // Compare the pattern objects, not the pointers: two separately
         // allocated objects always have different addresses, so a pointer
         // comparison would pass no matter what operator!= does.
2792     REGEX_ASSERT(*pat1 != *pat2);
2793     patb = *pat2;
2794     REGEX_ASSERT(patb != patc);
2795     REGEX_ASSERT(patb == *pat2);
2796
2797     // Compile with no flags.
2798     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2799     REGEX_ASSERT(*pat1a == *pat1);
2800
2801     REGEX_ASSERT(pat1a->flags() == 0);
2802
2803     // Compile with different flags should be not equal
2804     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2805     REGEX_CHECK_STATUS;
2806
2807     REGEX_ASSERT(*pat1b != *pat1a);
2808     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2809     REGEX_ASSERT(pat1a->flags() == 0);
2810     delete pat1b;
2811
2812     // clone
2813     RegexPattern *pat1c = pat1->clone();
2814     REGEX_ASSERT(*pat1c == *pat1);
2815     REGEX_ASSERT(*pat1c != *pat2);
2816
2817     delete pat1c;
2818     delete pat1a;
2819     delete pat1;
2820     delete pat2;
2821
2822     utext_close(&re1);
2823     utext_close(&re2);
2824
2825
2826     //
2827     //   Verify that a matcher created from a cloned pattern works.
2828     //     (Jitterbug 3423)
2829     //
2830     {
2831         UErrorCode     status     = U_ZERO_ERROR;
2832         UText          pattern    = UTEXT_INITIALIZER;
2833         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2834         utext_openUTF8(&pattern, str_pL, -1, &status);
2835
             // The source pattern is deleted before the clone is used, so the
             // clone has to work without it.
2836         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2837         RegexPattern  *pClone     = pSource->clone();
2838         delete         pSource;
2839         RegexMatcher  *mFromClone = pClone->matcher(status);
2840         REGEX_CHECK_STATUS;
2841
2842         UText          input      = UTEXT_INITIALIZER;
2843         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2844         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2845         mFromClone->reset(&input);
2846         REGEX_ASSERT(mFromClone->find() == TRUE);
2847         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2848         REGEX_ASSERT(mFromClone->find() == TRUE);
2849         REGEX_ASSERT(mFromClone->group(status) == "World");
2850         REGEX_ASSERT(mFromClone->find() == FALSE);
2851         delete mFromClone;
2852         delete pClone;
2853
2854         utext_close(&input);
2855         utext_close(&pattern);
2856     }
2857
2858     //
2859     //   matches convenience API
2860     //
         // Every case passes the UText pattern/input opened just before it, so
         // that the UText overload of RegexPattern::matches() is the one tested.
2861     {
2862         UErrorCode status  = U_ZERO_ERROR;
2863         UText      pattern = UTEXT_INITIALIZER;
2864         UText      input   = UTEXT_INITIALIZER;
2865
2866         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2867         utext_openUTF8(&input, str_randominput, -1, &status);
2868
2869         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2870         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2871         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2872         REGEX_CHECK_STATUS;
2873
2874         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2875         utext_openUTF8(&pattern, str_abc, -1, &status);
2876         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2877         REGEX_CHECK_STATUS;
2878
2879         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2880         utext_openUTF8(&pattern, str_nput, -1, &status);
2881         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2882         REGEX_CHECK_STATUS;
2883
2884         utext_openUTF8(&pattern, str_randominput, -1, &status);
2885         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2886         REGEX_CHECK_STATUS;
2887
2888         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2889         utext_openUTF8(&pattern, str_u, -1, &status);
2890         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2891         REGEX_CHECK_STATUS;
2892
             // With status already set to a failure code, matches() is expected
             // to return FALSE and leave that status untouched.
2893         utext_openUTF8(&input, str_abc, -1, &status);
2894         utext_openUTF8(&pattern, str_abc, -1, &status);
2895         status = U_INDEX_OUTOFBOUNDS_ERROR;
2896         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2897         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2898
2899         utext_close(&input);
2900         utext_close(&pattern);
2901     }
2902
2903
2904     //
2905     // Split()
2906     //
         // The split() calls below take the input as a string literal and write
         // into the UnicodeString array "fields".  Slots beyond the returned
         // count are checked too, to show that split() left them untouched.
2907     status = U_ZERO_ERROR;
2908     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2909     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2910     pat1 = RegexPattern::compile(&re1, pe, status);
2911     REGEX_CHECK_STATUS;
2912     UnicodeString  fields[10];
2913
2914     int32_t n;
2915     n = pat1->split("Now is the time", fields, 10, status);
2916     REGEX_CHECK_STATUS;
2917     REGEX_ASSERT(n==4);
2918     REGEX_ASSERT(fields[0]=="Now");
2919     REGEX_ASSERT(fields[1]=="is");
2920     REGEX_ASSERT(fields[2]=="the");
2921     REGEX_ASSERT(fields[3]=="time");
2922     REGEX_ASSERT(fields[4]=="");
2923
         // When capacity is smaller than the number of fields, the last slot
         // receives the whole unsplit remainder of the input.
2924     n = pat1->split("Now is the time", fields, 2, status);
2925     REGEX_CHECK_STATUS;
2926     REGEX_ASSERT(n==2);
2927     REGEX_ASSERT(fields[0]=="Now");
2928     REGEX_ASSERT(fields[1]=="is the time");
2929     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2930
2931     fields[1] = "*";
2932     status = U_ZERO_ERROR;
2933     n = pat1->split("Now is the time", fields, 1, status);
2934     REGEX_CHECK_STATUS;
2935     REGEX_ASSERT(n==1);
2936     REGEX_ASSERT(fields[0]=="Now is the time");
2937     REGEX_ASSERT(fields[1]=="*");
2938     status = U_ZERO_ERROR;
2939
         // Leading and trailing delimiters produce empty first and last fields.
2940     n = pat1->split("    Now       is the time   ", fields, 10, status);
2941     REGEX_CHECK_STATUS;
2942     REGEX_ASSERT(n==6);
2943     REGEX_ASSERT(fields[0]=="");
2944     REGEX_ASSERT(fields[1]=="Now");
2945     REGEX_ASSERT(fields[2]=="is");
2946     REGEX_ASSERT(fields[3]=="the");
2947     REGEX_ASSERT(fields[4]=="time");
2948     REGEX_ASSERT(fields[5]=="");
2949     REGEX_ASSERT(fields[6]=="");
2950
2951     fields[2] = "*";
2952     n = pat1->split("     ", fields, 10, status);
2953     REGEX_CHECK_STATUS;
2954     REGEX_ASSERT(n==2);
2955     REGEX_ASSERT(fields[0]=="");
2956     REGEX_ASSERT(fields[1]=="");
2957     REGEX_ASSERT(fields[2]=="*");
2958
         // Empty input: zero fields are reported and fields[0] keeps its old value.
2959     fields[0] = "foo";
2960     n = pat1->split("", fields, 10, status);
2961     REGEX_CHECK_STATUS;
2962     REGEX_ASSERT(n==0);
2963     REGEX_ASSERT(fields[0]=="foo");
2964
2965     delete pat1;
2966
2967     //  split, with a pattern with (capture)
         //  The text matched by the capture group shows up as a field of its own
         //  between the surrounding pieces.
2968     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2969     pat1 = RegexPattern::compile(&re1,  pe, status);
2970     REGEX_CHECK_STATUS;
2971
2972     status = U_ZERO_ERROR;
2973     fields[6] = fields[7] = "*";
2974     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2975     REGEX_CHECK_STATUS;
2976     REGEX_ASSERT(n==7);
2977     REGEX_ASSERT(fields[0]=="");
2978     REGEX_ASSERT(fields[1]=="a");
2979     REGEX_ASSERT(fields[2]=="Now is ");
2980     REGEX_ASSERT(fields[3]=="b");
2981     REGEX_ASSERT(fields[4]=="the time");
2982     REGEX_ASSERT(fields[5]=="c");
2983     REGEX_ASSERT(fields[6]=="");
2984     REGEX_ASSERT(fields[7]=="*");
2985     REGEX_ASSERT(status==U_ZERO_ERROR);
2986
2987     fields[6] = fields[7] = "*";
2988     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2989     REGEX_CHECK_STATUS;
2990     REGEX_ASSERT(n==7);
2991     REGEX_ASSERT(fields[0]=="  ");
2992     REGEX_ASSERT(fields[1]=="a");
2993     REGEX_ASSERT(fields[2]=="Now is ");
2994     REGEX_ASSERT(fields[3]=="b");
2995     REGEX_ASSERT(fields[4]=="the time");
2996     REGEX_ASSERT(fields[5]=="c");
2997     REGEX_ASSERT(fields[6]=="");
2998     REGEX_ASSERT(fields[7]=="*");
2999
         // The remaining cases limit the capacity so that the output is cut
         // off at different points relative to the captured fields.
3000     status = U_ZERO_ERROR;
3001     fields[6] = "foo";
3002     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3003     REGEX_CHECK_STATUS;
3004     REGEX_ASSERT(n==6);
3005     REGEX_ASSERT(fields[0]=="  ");
3006     REGEX_ASSERT(fields[1]=="a");
3007     REGEX_ASSERT(fields[2]=="Now is ");
3008     REGEX_ASSERT(fields[3]=="b");
3009     REGEX_ASSERT(fields[4]=="the time");
3010     REGEX_ASSERT(fields[5]==" ");
3011     REGEX_ASSERT(fields[6]=="foo");
3012
3013     status = U_ZERO_ERROR;
3014     fields[5] = "foo";
3015     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3016     REGEX_CHECK_STATUS;
3017     REGEX_ASSERT(n==5);
3018     REGEX_ASSERT(fields[0]=="  ");
3019     REGEX_ASSERT(fields[1]=="a");
3020     REGEX_ASSERT(fields[2]=="Now is ");
3021     REGEX_ASSERT(fields[3]=="b");
3022     REGEX_ASSERT(fields[4]=="the time<c>");
3023     REGEX_ASSERT(fields[5]=="foo");
3024
3025     status = U_ZERO_ERROR;
3026     fields[5] = "foo";
3027     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3028     REGEX_CHECK_STATUS;
3029     REGEX_ASSERT(n==5);
3030     REGEX_ASSERT(fields[0]=="  ");
3031     REGEX_ASSERT(fields[1]=="a");
3032     REGEX_ASSERT(fields[2]=="Now is ");
3033     REGEX_ASSERT(fields[3]=="b");
3034     REGEX_ASSERT(fields[4]=="the time");
3035     REGEX_ASSERT(fields[5]=="foo");
3036
3037     status = U_ZERO_ERROR;
3038     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3039     REGEX_CHECK_STATUS;
3040     REGEX_ASSERT(n==4);
3041     REGEX_ASSERT(fields[0]=="  ");
3042     REGEX_ASSERT(fields[1]=="a");
3043     REGEX_ASSERT(fields[2]=="Now is ");
3044     REGEX_ASSERT(fields[3]=="the time<c>");
3045     status = U_ZERO_ERROR;
3046     delete pat1;
3047
3048     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3049     pat1 = RegexPattern::compile(&re1, pe, status);
3050     REGEX_CHECK_STATUS;
3051     n = pat1->split("1-10,20", fields, 10, status);
3052     REGEX_CHECK_STATUS;
3053     REGEX_ASSERT(n==5);
3054     REGEX_ASSERT(fields[0]=="1");
3055     REGEX_ASSERT(fields[1]=="-");
3056     REGEX_ASSERT(fields[2]=="10");
3057     REGEX_ASSERT(fields[3]==",");
3058     REGEX_ASSERT(fields[4]=="20");
3059     delete pat1;
3060
3061
3062     //
3063     // RegexPattern::pattern() and patternText()
3064     //
         // A default-constructed pattern reports an empty pattern string; a
         // compiled one reports the text it was compiled from.
3065     pat1 = new RegexPattern();
3066     REGEX_ASSERT(pat1->pattern() == "");
3067     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3068     delete pat1;
3069     const char *helloWorldInvariant = "(Hello, world)*";
3070     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3071     pat1 = RegexPattern::compile(&re1, pe, status);
3072     REGEX_CHECK_STATUS;
3073     REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3074     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3075     delete pat1;
3076
3077     utext_close(&re1);
3078 }
3079
3080
3081 //---------------------------------------------------------------------------
3082 //
3083 //      Extended       A more thorough check for features of regex patterns
3084 //                     The test cases are in a separate data file,
3085 //                       source/test/testdata/regextst.txt
3086 //                     A description of the test data format is included in that file.
3087 //
3088 //---------------------------------------------------------------------------
3089
3090 const char *
3091 RegexTest::getPath(char buffer[2048], const char *filename) {
3092     UErrorCode status=U_ZERO_ERROR;
3093     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3094     if (U_FAILURE(status)) {
3095         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3096         return NULL;
3097     }
3098
3099     strcpy(buffer, testDataDirectory);
3100     strcat(buffer, filename);
3101     return buffer;
3102 }
3103
3104 void RegexTest::Extended() {
3105     char tdd[2048];
3106     const char *srcPath;
3107     UErrorCode  status  = U_ZERO_ERROR;
3108     int32_t     lineNum = 0;
3109
3110     //
3111     //  Open and read the test data file.
3112     //
3113     srcPath=getPath(tdd, "regextst.txt");
3114     if(srcPath==NULL) {
3115         return; /* something went wrong, error already output */
3116     }
3117
3118     int32_t    len;
3119     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3120     if (U_FAILURE(status)) {
3121         return; /* something went wrong, error already output */
3122     }
3123
3124     //
3125     //  Put the test data into a UnicodeString
3126     //
3127     UnicodeString testString(FALSE, testData, len);
3128
3129     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3130     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3131     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3132
3133     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3134     UnicodeString   testPattern;   // The pattern for test from the test file.
3135     UnicodeString   testFlags;     // the flags   for a test.
3136     UnicodeString   matchString;   // The marked up string to be used as input
3137
3138     if (U_FAILURE(status)){
3139         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3140         delete [] testData;
3141         return;
3142     }
3143
3144     //
3145     //  Loop over the test data file, once per line.
3146     //
3147     while (lineMat.find()) {
3148         lineNum++;
3149         if (U_FAILURE(status)) {
3150           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3151         }
3152
3153         status = U_ZERO_ERROR;
3154         UnicodeString testLine = lineMat.group(1, status);
3155         if (testLine.length() == 0) {
3156             continue;
3157         }
3158
3159         //
3160         // Parse the test line.  Skip blank and comment only lines.
3161         // Separate out the three main fields - pattern, flags, target.
3162         //
3163
3164         commentMat.reset(testLine);
3165         if (commentMat.lookingAt(status)) {
3166             // This line is a comment, or blank.
3167             continue;
3168         }
3169
3170         //
3171         //  Pull out the pattern field, remove it from the test file line.
3172         //
3173         quotedStuffMat.reset(testLine);
3174         if (quotedStuffMat.lookingAt(status)) {
3175             testPattern = quotedStuffMat.group(2, status);
3176             testLine.remove(0, quotedStuffMat.end(0, status));
3177         } else {
3178             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3179             continue;
3180         }
3181
3182
3183         //
3184         //  Pull out the flags from the test file line.
3185         //
3186         flagsMat.reset(testLine);
3187         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3188         testFlags = flagsMat.group(1, status);
3189         if (flagsMat.group(2, status).length() > 0) {
3190             errln("Bad Match flag at line %d. Scanning %c\n",
3191                 lineNum, flagsMat.group(2, status).charAt(0));
3192             continue;
3193         }
3194         testLine.remove(0, flagsMat.end(0, status));
3195
3196         //
3197         //  Pull out the match string, as a whole.
3198         //    We'll process the <tags> later.
3199         //
3200         quotedStuffMat.reset(testLine);
3201         if (quotedStuffMat.lookingAt(status)) {
3202             matchString = quotedStuffMat.group(2, status);
3203             testLine.remove(0, quotedStuffMat.end(0, status));
3204         } else {
3205             errln("Bad match string at test file line %d", lineNum);
3206             continue;
3207         }
3208
3209         //
3210         //  The only thing left from the input line should be an optional trailing comment.
3211         //
3212         commentMat.reset(testLine);
3213         if (commentMat.lookingAt(status) == FALSE) {
3214             errln("Line %d: unexpected characters at end of test line.", lineNum);
3215             continue;
3216         }
3217
3218         //
3219         //  Run the test
3220         //
3221         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3222     }
3223
3224     delete [] testData;
3225
3226 }
3227
3228
3229
3230 //---------------------------------------------------------------------------
3231 //
3232 //    regex_find(pattern, flags, inputString, lineNumber)
3233 //
3234 //         Function to run a single test from the Extended (data driven) tests.
3235 //         See file test/testdata/regextst.txt for a description of the
3236 //         pattern and inputString fields, and the allowed flags.
3237 //         lineNumber is the source line in regextst.txt of the test.
3238 //
3239 //---------------------------------------------------------------------------
3240
3241
3242 //  Set a value into a UVector at position specified by a decimal number in
3243 //   a UnicodeString.   This is a utility function needed by the actual test function,
3244 //   which follows.
3245 static void set(UVector &vec, int32_t val, UnicodeString index) {
3246     UErrorCode  status=U_ZERO_ERROR;
3247     int32_t  idx = 0;
3248     for (int32_t i=0; i<index.length(); i++) {
3249         int32_t d=u_charDigitValue(index.charAt(i));
3250         if (d<0) {return;}
3251         idx = idx*10 + d;
3252     }
3253     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3254     vec.setElementAt(val, idx);
3255 }
3256
3257 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3258     UErrorCode  status=U_ZERO_ERROR;
3259     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3260     vec.setElementAt(val, idx);
3261 }
3262
3263 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3264 {
3265     UBool couldFind = TRUE;
3266     UTEXT_SETNATIVEINDEX(utext, 0);
3267     int32_t i = 0;
3268     while (i < unistrOffset) {
3269         UChar32 c = UTEXT_NEXT32(utext);
3270         if (c != U_SENTINEL) {
3271             i += U16_LENGTH(c);
3272         } else {
3273             couldFind = FALSE;
3274             break;
3275         }
3276     }
3277     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3278     return couldFind;
3279 }
3280
3281
3282 void RegexTest::regex_find(const UnicodeString &pattern,
3283                            const UnicodeString &flags,
3284                            const UnicodeString &inputString,
3285                            const char *srcPath,
3286                            int32_t line) {
3287     UnicodeString       unEscapedInput;
3288     UnicodeString       deTaggedInput;
3289
3290     int32_t             patternUTF8Length,      inputUTF8Length;
3291     char                *patternChars  = NULL, *inputChars = NULL;
3292     UText               patternText    = UTEXT_INITIALIZER;
3293     UText               inputText      = UTEXT_INITIALIZER;
3294     UConverter          *UTF8Converter = NULL;
3295
3296     UErrorCode          status         = U_ZERO_ERROR;
3297     UParseError         pe;
3298     RegexPattern        *parsePat      = NULL;
3299     RegexMatcher        *parseMatcher  = NULL;
3300     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3301     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3302     UVector             groupStarts(status);
3303     UVector             groupEnds(status);
3304     UVector             groupStartsUTF8(status);
3305     UVector             groupEndsUTF8(status);
3306     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3307     UBool               failed         = FALSE;
3308     int32_t             numFinds;
3309     int32_t             i;
3310     UBool               useMatchesFunc   = FALSE;
3311     UBool               useLookingAtFunc = FALSE;
3312     int32_t             regionStart      = -1;
3313     int32_t             regionEnd        = -1;
3314     int32_t             regionStartUTF8  = -1;
3315     int32_t             regionEndUTF8    = -1;
3316
3317
3318     //
3319     //  Compile the caller's pattern
3320     //
3321     uint32_t bflags = 0;
3322     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3323         bflags |= UREGEX_CASE_INSENSITIVE;
3324     }
3325     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3326         bflags |= UREGEX_COMMENTS;
3327     }
3328     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3329         bflags |= UREGEX_DOTALL;
3330     }
3331     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3332         bflags |= UREGEX_MULTILINE;
3333     }
3334
3335     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3336         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3337     }
3338     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3339         bflags |= UREGEX_UNIX_LINES;
3340     }
3341     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3342         bflags |= UREGEX_LITERAL;
3343     }
3344
3345
3346     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3347     if (status != U_ZERO_ERROR) {
3348         #if UCONFIG_NO_BREAK_ITERATION==1
3349         // 'v' test flag means that the test pattern should not compile if ICU was configured
3350         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3351         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3352             goto cleanupAndReturn;
3353         }
3354         #endif
3355         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3356             // Expected pattern compilation error.
3357             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3358                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3359             }
3360             goto cleanupAndReturn;
3361         } else {
3362             // Unexpected pattern compilation error.
3363             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3364             goto cleanupAndReturn;
3365         }
3366     }
3367
3368     UTF8Converter = ucnv_open("UTF8", &status);
3369     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3370
3371     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3372     status = U_ZERO_ERROR; // buffer overflow
3373     patternChars = new char[patternUTF8Length+1];
3374     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3375     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3376
3377     if (status == U_ZERO_ERROR) {
3378         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3379
3380         if (status != U_ZERO_ERROR) {
3381 #if UCONFIG_NO_BREAK_ITERATION==1
3382             // 'v' test flag means that the test pattern should not compile if ICU was configured
3383             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3384             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3385                 goto cleanupAndReturn;
3386             }
3387 #endif
3388             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3389                 // Expected pattern compilation error.
3390                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3391                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3392                 }
3393                 goto cleanupAndReturn;
3394             } else {
3395                 // Unexpected pattern compilation error.
3396                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3397                 goto cleanupAndReturn;
3398             }
3399         }
3400     }
3401
3402     if (UTF8Pattern == NULL) {
3403         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3404         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3405         status = U_ZERO_ERROR;
3406     }
3407
3408     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3409         callerPattern->dumpPattern();
3410     }
3411
3412     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3413         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3414         goto cleanupAndReturn;
3415     }
3416
3417
3418     //
3419     // Number of times find() should be called on the test string, default to 1
3420     //
3421     numFinds = 1;
3422     for (i=2; i<=9; i++) {
3423         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3424             if (numFinds != 1) {
3425                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3426                 goto cleanupAndReturn;
3427             }
3428             numFinds = i;
3429         }
3430     }
3431
3432     // 'M' flag.  Use matches() instead of find()
3433     if (flags.indexOf((UChar)0x4d) >= 0) {
3434         useMatchesFunc = TRUE;
3435     }
3436     if (flags.indexOf((UChar)0x4c) >= 0) {
3437         useLookingAtFunc = TRUE;
3438     }
3439
3440     //
3441     //  Find the tags in the input data, remove them, and record the group boundary
3442     //    positions.
3443     //
3444     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3445     REGEX_CHECK_STATUS_L(line);
3446
3447     unEscapedInput = inputString.unescape();
3448     parseMatcher = parsePat->matcher(unEscapedInput, status);
3449     REGEX_CHECK_STATUS_L(line);
3450     while(parseMatcher->find()) {
3451         parseMatcher->appendReplacement(deTaggedInput, "", status);
3452         REGEX_CHECK_STATUS;
3453         UnicodeString groupNum = parseMatcher->group(2, status);
3454         if (groupNum == "r") {
3455             // <r> or </r>, a region specification within the string
3456             if (parseMatcher->group(1, status) == "/") {
3457                 regionEnd = deTaggedInput.length();
3458             } else {
3459                 regionStart = deTaggedInput.length();
3460             }
3461         } else {
3462             // <digits> or </digits>, a group match boundary tag.
3463             if (parseMatcher->group(1, status) == "/") {
3464                 set(groupEnds, deTaggedInput.length(), groupNum);
3465             } else {
3466                 set(groupStarts, deTaggedInput.length(), groupNum);
3467             }
3468         }
3469     }
3470     parseMatcher->appendTail(deTaggedInput);
3471     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3472     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3473       errln("mismatched <r> tags");
3474       failed = TRUE;
3475       goto cleanupAndReturn;
3476     }
3477
3478     //
3479     //  Configure the matcher according to the flags specified with this test.
3480     //
3481     matcher = callerPattern->matcher(deTaggedInput, status);
3482     REGEX_CHECK_STATUS_L(line);
3483     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3484         matcher->setTrace(TRUE);
3485     }
3486
3487     if (UTF8Pattern != NULL) {
3488         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3489         status = U_ZERO_ERROR; // buffer overflow
3490         inputChars = new char[inputUTF8Length+1];
3491         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3492         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3493
3494         if (status == U_ZERO_ERROR) {
3495             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3496             REGEX_CHECK_STATUS_L(line);
3497         }
3498
3499         if (UTF8Matcher == NULL) {
3500             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3501           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3502             status = U_ZERO_ERROR;
3503         }
3504     }
3505
3506     //
3507     //  Generate native indices for UTF8 versions of region and capture group info
3508     //
3509     if (UTF8Matcher != NULL) {
3510         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3511         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3512
3513         //  Fill out the native index UVector info.
3514         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3515         for (i=0; i<groupStarts.size(); i++) {
3516             int32_t  start = groupStarts.elementAti(i);
3517             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3518             if (start >= 0) {
3519                 int32_t  startUTF8;
3520                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3521                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3522                     failed = TRUE;
3523                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3524                 }
3525                 setInt(groupStartsUTF8, startUTF8, i);
3526             }
3527
3528             int32_t  end = groupEnds.elementAti(i);
3529             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3530             if (end >= 0) {
3531                 int32_t  endUTF8;
3532                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3533                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3534                     failed = TRUE;
3535                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3536                 }
3537                 setInt(groupEndsUTF8, endUTF8, i);
3538             }
3539         }
3540     }
3541
3542     if (regionStart>=0) {
3543        matcher->region(regionStart, regionEnd, status);
3544        REGEX_CHECK_STATUS_L(line);
3545        if (UTF8Matcher != NULL) {
3546            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3547            REGEX_CHECK_STATUS_L(line);
3548        }
3549     }
3550     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3551         matcher->useAnchoringBounds(FALSE);
3552         if (UTF8Matcher != NULL) {
3553             UTF8Matcher->useAnchoringBounds(FALSE);
3554         }
3555     }
3556     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3557         matcher->useTransparentBounds(TRUE);
3558         if (UTF8Matcher != NULL) {
3559             UTF8Matcher->useTransparentBounds(TRUE);
3560         }
3561     }
3562
3563
3564
3565     //
3566     // Do a find on the de-tagged input using the caller's pattern
3567     //     TODO: error on count>1 and not find().
3568     //           error on both matches() and lookingAt().
3569     //
3570     for (i=0; i<numFinds; i++) {
3571         if (useMatchesFunc) {
3572             isMatch = matcher->matches(status);
3573             if (UTF8Matcher != NULL) {
3574                isUTF8Match = UTF8Matcher->matches(status);
3575             }
3576         } else  if (useLookingAtFunc) {
3577             isMatch = matcher->lookingAt(status);
3578             if (UTF8Matcher != NULL) {
3579                 isUTF8Match = UTF8Matcher->lookingAt(status);
3580             }
3581         } else {
3582             isMatch = matcher->find();
3583             if (UTF8Matcher != NULL) {
3584                 isUTF8Match = UTF8Matcher->find();
3585             }
3586         }
3587     }
3588     matcher->setTrace(FALSE);
3589     if (U_FAILURE(status)) {
3590         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3591     }
3592
3593     //
3594     // Match up the groups from the find() with the groups from the tags
3595     //
3596
3597     // number of tags should match number of groups from find operation.
3598     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3599     //   G option in test means that capture group data is not available in the
3600     //     expected results, so the check needs to be suppressed.
3601     if (isMatch == FALSE && groupStarts.size() != 0) {
3602         dataerrln("Error at line %d:  Match expected, but none found.", line);
3603         failed = TRUE;
3604         goto cleanupAndReturn;
3605     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3606         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3607         failed = TRUE;
3608         goto cleanupAndReturn;
3609     }
3610
3611     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3612         // Only check for match / no match.  Don't check capture groups.
3613         if (isMatch && groupStarts.size() == 0) {
3614             errln("Error at line %d:  No match expected, but one found.", line);
3615             failed = TRUE;
3616         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3617             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3618             failed = TRUE;
3619         }
3620         goto cleanupAndReturn;
3621     }
3622
3623     REGEX_CHECK_STATUS_L(line);
3624     for (i=0; i<=matcher->groupCount(); i++) {
3625         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3626         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3627         if (matcher->start(i, status) != expectedStart) {
3628             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3629                 line, i, expectedStart, matcher->start(i, status));
3630             failed = TRUE;
3631             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3632         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3633             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3634                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3635             failed = TRUE;
3636             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3637         }
3638
3639         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3640         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3641         if (matcher->end(i, status) != expectedEnd) {
3642             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3643                 line, i, expectedEnd, matcher->end(i, status));
3644             failed = TRUE;
3645             // Error on end position;  keep going; real error is probably yet to come as group
3646             //   end positions work from end of the input data towards the front.
3647         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3648             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3649                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3650             failed = TRUE;
3651             // Error on end position;  keep going; real error is probably yet to come as group
3652             //   end positions work from end of the input data towards the front.
3653         }
3654     }
3655     if ( matcher->groupCount()+1 < groupStarts.size()) {
3656         errln("Error at line %d: Expected %d capture groups, found %d.",
3657             line, groupStarts.size()-1, matcher->groupCount());
3658         failed = TRUE;
3659         }
3660     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3661         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3662               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3663         failed = TRUE;
3664     }
3665
3666     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3667         matcher->requireEnd() == TRUE) {
3668         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3669         failed = TRUE;
3670     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3671         UTF8Matcher->requireEnd() == TRUE) {
3672         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3673         failed = TRUE;
3674     }
3675
3676     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3677         matcher->requireEnd() == FALSE) {
3678         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3679         failed = TRUE;
3680     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3681         UTF8Matcher->requireEnd() == FALSE) {
3682         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3683         failed = TRUE;
3684     }
3685
3686     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3687         matcher->hitEnd() == TRUE) {
3688         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3689         failed = TRUE;
3690     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3691                UTF8Matcher->hitEnd() == TRUE) {
3692         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3693         failed = TRUE;
3694     }
3695
3696     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3697         matcher->hitEnd() == FALSE) {
3698         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3699         failed = TRUE;
3700     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3701                UTF8Matcher->hitEnd() == FALSE) {
3702         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3703         failed = TRUE;
3704     }
3705
3706
3707 cleanupAndReturn:
3708     if (failed) {
3709         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3710             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3711         // callerPattern->dump();
3712     }
3713     delete parseMatcher;
3714     delete parsePat;
3715     delete UTF8Matcher;
3716     delete UTF8Pattern;
3717     delete matcher;
3718     delete callerPattern;
3719
3720     utext_close(&inputText);
3721     delete[] inputChars;
3722     utext_close(&patternText);
3723     delete[] patternChars;
3724     ucnv_close(UTF8Converter);
3725 }
3726
3727
3728
3729
3730 //---------------------------------------------------------------------------
3731 //
3732 //      Errors     Check for error handling in patterns.
3733 //
3734 //---------------------------------------------------------------------------
3735 void RegexTest::Errors() {
3736     // \escape sequences that aren't implemented yet.
3737     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3738
3739     // Missing close parentheses
3740     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3741     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3742     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3743
3744     // Extra close paren
3745     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3746     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3747     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3748
3749     // Look-ahead, Look-behind
3750     //  TODO:  add tests for unbounded length look-behinds.
3751     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3752
3753     // Attempt to use non-default flags
3754     {
3755         UParseError   pe;
3756         UErrorCode    status = U_ZERO_ERROR;
3757         int32_t       flags  = UREGEX_CANON_EQ |
3758                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3759                                UREGEX_MULTILINE;
3760         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3761         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3762         delete pat1;
3763     }
3764
3765
3766     // Quantifiers are allowed only after something that can be quantified.
3767     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3768     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3769     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3770
3771     // Mal-formed {min,max} quantifiers
3772     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3773     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3774     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3775     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3776     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3777     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3778     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3779     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3780     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3781
3782     // Ticket 5389
3783     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3784
3785     // Invalid Back Reference \0
3786     //    For ICU 3.8 and earlier
3787     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3788     //
3789     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3790
3791 }
3792
3793
3794 //-------------------------------------------------------------------------------
3795 //
3796 //  Read a text data file, convert it to UChars, and return the data
3797 //    in one big UChar * buffer, which the caller must delete.
3798 //
3799 //--------------------------------------------------------------------------------
3800 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3801                                      const char *defEncoding, UErrorCode &status) {
3802     UChar       *retPtr  = NULL;
3803     char        *fileBuf = NULL;
3804     UConverter* conv     = NULL;
3805     FILE        *f       = NULL;
3806
3807     ulen = 0;
3808     if (U_FAILURE(status)) {
3809         return retPtr;
3810     }
3811
3812     //
3813     //  Open the file.
3814     //
3815     f = fopen(fileName, "rb");
3816     if (f == 0) {
3817         dataerrln("Error opening test data file %s\n", fileName);
3818         status = U_FILE_ACCESS_ERROR;
3819         return NULL;
3820     }
3821     //
3822     //  Read it in
3823     //
3824     int32_t            fileSize;
3825     int32_t            amt_read;
3826
3827     fseek( f, 0, SEEK_END);
3828     fileSize = ftell(f);
3829     fileBuf = new char[fileSize];
3830     fseek(f, 0, SEEK_SET);
3831     amt_read = fread(fileBuf, 1, fileSize, f);
3832     if (amt_read != fileSize || fileSize <= 0) {
3833         errln("Error reading test data file.");
3834         goto cleanUpAndReturn;
3835     }
3836
3837     //
3838     // Look for a Unicode Signature (BOM) on the data just read
3839     //
3840     int32_t        signatureLength;
3841     const char *   fileBufC;
3842     const char*    encoding;
3843
3844     fileBufC = fileBuf;
3845     encoding = ucnv_detectUnicodeSignature(
3846         fileBuf, fileSize, &signatureLength, &status);
3847     if(encoding!=NULL ){
3848         fileBufC  += signatureLength;
3849         fileSize  -= signatureLength;
3850     } else {
3851         encoding = defEncoding;
3852         if (strcmp(encoding, "utf-8") == 0) {
3853             errln("file %s is missing its BOM", fileName);
3854         }
3855     }
3856
3857     //
3858     // Open a converter to take the rule file to UTF-16
3859     //
3860     conv = ucnv_open(encoding, &status);
3861     if (U_FAILURE(status)) {
3862         goto cleanUpAndReturn;
3863     }
3864
3865     //
3866     // Convert the rules to UChar.
3867     //  Preflight first to determine required buffer size.
3868     //
3869     ulen = ucnv_toUChars(conv,
3870         NULL,           //  dest,
3871         0,              //  destCapacity,
3872         fileBufC,
3873         fileSize,
3874         &status);
3875     if (status == U_BUFFER_OVERFLOW_ERROR) {
3876         // Buffer Overflow is expected from the preflight operation.
3877         status = U_ZERO_ERROR;
3878
3879         retPtr = new UChar[ulen+1];
3880         ucnv_toUChars(conv,
3881             retPtr,       //  dest,
3882             ulen+1,
3883             fileBufC,
3884             fileSize,
3885             &status);
3886     }
3887
3888 cleanUpAndReturn:
3889     fclose(f);
3890     delete[] fileBuf;
3891     ucnv_close(conv);
3892     if (U_FAILURE(status)) {
3893         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3894         delete []retPtr;
3895         retPtr = 0;
3896         ulen   = 0;
3897     };
3898     return retPtr;
3899 }
3900
3901
3902 //-------------------------------------------------------------------------------
3903 //
3904 //   PerlTests  - Run Perl's regular expression tests
3905 //                The input file for this test is re_tests, the standard regular
3906 //                expression test data distributed with the Perl source code.
3907 //
3908 //                Here is Perl's description of the test data file:
3909 //
3910 //        # The tests are in a separate file 't/op/re_tests'.
3911 //        # Each line in that file is a separate test.
3912 //        # There are five columns, separated by tabs.
3913 //        #
3914 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3915 //        # Modifiers can be put after the closing C<'>.
3916 //        #
3917 //        # Column 2 contains the string to be matched.
3918 //        #
3919 //        # Column 3 contains the expected result:
3920 //        #     y   expect a match
3921 //        #     n   expect no match
3922 //        #     c   expect an error
3923 //        # B   test exposes a known bug in Perl, should be skipped
3924 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3925 //        #
3926 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3927 //        #
3928 //        # Column 4 contains a string, usually C<$&>.
3929 //        #
3930 //        # Column 5 contains the expected result of double-quote
3931 //        # interpolating that string after the match, or start of error message.
3932 //        #
3933 //        # Column 6, if present, contains a reason why the test is skipped.
3934 //        # This is printed with "skipped", for harness to pick up.
3935 //        #
3936 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3937 //        #
3938 //        # If you want to add a regular expression test that can't be expressed
3939 //        # in this format, don't add it here: put it in op/pat.t instead.
3940 //
3941 //        For ICU, if field 3 contains an 'i', the test will be skipped.
3942 //        The test exposes is some known incompatibility between ICU and Perl regexps.
3943 //        (The i is in addition to whatever was there before.)
3944 //
3945 //-------------------------------------------------------------------------------
3946 void RegexTest::PerlTests() {
3947     char tdd[2048];
3948     const char *srcPath;
3949     UErrorCode  status = U_ZERO_ERROR;
3950     UParseError pe;
3951
3952     //
3953     //  Open and read the test data file.
3954     //
3955     srcPath=getPath(tdd, "re_tests.txt");
3956     if(srcPath==NULL) {
3957         return; /* something went wrong, error already output */
3958     }
3959
3960     int32_t    len;
3961     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3962     if (U_FAILURE(status)) {
3963         return; /* something went wrong, error already output */
3964     }
3965
3966     //
3967     //  Put the test data into a UnicodeString
3968     //
3969     UnicodeString testDataString(FALSE, testData, len);
3970
3971     //
3972     //  Regex to break the input file into lines, and strip the new lines.
3973     //     One line per match, capture group one is the desired data.
3974     //
3975     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3976     if (U_FAILURE(status)) {
3977         dataerrln("RegexPattern::compile() error");
3978         return;
3979     }
3980     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3981
3982     //
3983     //  Regex to split a test file line into fields.
3984     //    There are six fields, separated by tabs.
3985     //
3986     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3987
3988     //
3989     //  Regex to identify test patterns with flag settings, and to separate them.
3990     //    Test patterns with flags look like 'pattern'i
3991     //    Test patterns without flags are not quoted:   pattern
3992     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3993     //
3994     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3995     RegexMatcher* flagMat = flagPat->matcher(status);
3996
3997     //
3998     // The Perl tests reference several perl-isms, which are evaluated/substituted
3999     //   in the test data.  Not being perl, this must be done explicitly.  Here
4000     //   are string constants and REs for these constructs.
4001     //
4002     UnicodeString nulnulSrc("${nulnul}");
4003     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4004     nulnul = nulnul.unescape();
4005
4006     UnicodeString ffffSrc("${ffff}");
4007     UnicodeString ffff("\\uffff", -1, US_INV);
4008     ffff = ffff.unescape();
4009
4010     //  regexp for $-[0], $+[2], etc.
4011     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4012     RegexMatcher *groupsMat = groupsPat->matcher(status);
4013
4014     //  regexp for $0, $1, $2, etc.
4015     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4016     RegexMatcher *cgMat = cgPat->matcher(status);
4017
4018
4019     //
4020     // Main Loop for the Perl Tests, runs once per line from the
4021     //   test data file.
4022     //
4023     int32_t  lineNum = 0;
4024     int32_t  skippedUnimplementedCount = 0;
4025     while (lineMat->find()) {
4026         lineNum++;
4027
4028         //
4029         //  Get a line, break it into its fields, do the Perl
4030         //    variable substitutions.
4031         //
4032         UnicodeString line = lineMat->group(1, status);
4033         UnicodeString fields[7];
4034         fieldPat->split(line, fields, 7, status);
4035
4036         flagMat->reset(fields[0]);
4037         flagMat->matches(status);
4038         UnicodeString pattern  = flagMat->group(2, status);
4039         pattern.findAndReplace("${bang}", "!");
4040         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4041         pattern.findAndReplace(ffffSrc, ffff);
4042
4043         //
4044         //  Identify patterns that include match flag settings,
4045         //    split off the flags, remove the extra quotes.
4046         //
4047         UnicodeString flagStr = flagMat->group(3, status);
4048         if (U_FAILURE(status)) {
4049             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4050             return;
4051         }
4052         int32_t flags = 0;
4053         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4054         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4055         const UChar UChar_m = 0x6d;
4056         const UChar UChar_x = 0x78;
4057         const UChar UChar_y = 0x79;
4058         if (flagStr.indexOf(UChar_i) != -1) {
4059             flags |= UREGEX_CASE_INSENSITIVE;
4060         }
4061         if (flagStr.indexOf(UChar_m) != -1) {
4062             flags |= UREGEX_MULTILINE;
4063         }
4064         if (flagStr.indexOf(UChar_x) != -1) {
4065             flags |= UREGEX_COMMENTS;
4066         }
4067
4068         //
4069         // Compile the test pattern.
4070         //
4071         status = U_ZERO_ERROR;
4072         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4073         if (status == U_REGEX_UNIMPLEMENTED) {
4074             //
4075             // Test of a feature that is planned for ICU, but not yet implemented.
4076             //   skip the test.
4077             skippedUnimplementedCount++;
4078             delete testPat;
4079             status = U_ZERO_ERROR;
4080             continue;
4081         }
4082
4083         if (U_FAILURE(status)) {
4084             // Some tests are supposed to generate errors.
4085             //   Only report an error for tests that are supposed to succeed.
4086             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4087                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4088             {
4089                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4090             }
4091             status = U_ZERO_ERROR;
4092             delete testPat;
4093             continue;
4094         }
4095
4096         if (fields[2].indexOf(UChar_i) >= 0) {
4097             // ICU should skip this test.
4098             delete testPat;
4099             continue;
4100         }
4101
4102         if (fields[2].indexOf(UChar_c) >= 0) {
4103             // This pattern should have caused a compilation error, but didn't/
4104             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4105             delete testPat;
4106             continue;
4107         }
4108
4109         //
4110         // replace the Perl variables that appear in some of the
4111         //   match data strings.
4112         //
4113         UnicodeString matchString = fields[1];
4114         matchString.findAndReplace(nulnulSrc, nulnul);
4115         matchString.findAndReplace(ffffSrc,   ffff);
4116
4117         // Replace any \n in the match string with an actual new-line char.
4118         //  Don't do full unescape, as this unescapes more than Perl does, which
4119         //  causes other spurious failures in the tests.
4120         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4121
4122
4123
4124         //
4125         // Run the test, check for expected match/don't match result.
4126         //
4127         RegexMatcher *testMat = testPat->matcher(matchString, status);
4128         UBool found = testMat->find();
4129         UBool expected = FALSE;
4130         if (fields[2].indexOf(UChar_y) >=0) {
4131             expected = TRUE;
4132         }
4133         if (expected != found) {
4134             errln("line %d: Expected %smatch, got %smatch",
4135                 lineNum, expected?"":"no ", found?"":"no " );
4136             continue;
4137         }
4138
4139         // Don't try to check expected results if there is no match.
4140         //   (Some have stuff in the expected fields)
4141         if (!found) {
4142             delete testMat;
4143             delete testPat;
4144             continue;
4145         }
4146
4147         //
4148         // Interpret the Perl expression from the fourth field of the data file,
4149         // building up an ICU string from the results of the ICU match.
4150         //   The Perl expression will contain references to the results of
4151         //     a regex match, including the matched string, capture group strings,
4152         //     group starting and ending indicies, etc.
4153         //
4154         UnicodeString resultString;
4155         UnicodeString perlExpr = fields[3];
4156 #if SUPPORT_MUTATING_INPUT_STRING
4157         groupsMat->reset(perlExpr);
4158         cgMat->reset(perlExpr);
4159 #endif
4160
4161         while (perlExpr.length() > 0) {
4162 #if !SUPPORT_MUTATING_INPUT_STRING
4163             //  Perferred usage.  Reset after any modification to input string.
4164             groupsMat->reset(perlExpr);
4165             cgMat->reset(perlExpr);
4166 #endif
4167
4168             if (perlExpr.startsWith("$&")) {
4169                 resultString.append(testMat->group(status));
4170                 perlExpr.remove(0, 2);
4171             }
4172
4173             else if (groupsMat->lookingAt(status)) {
4174                 // $-[0]   $+[2]  etc.
4175                 UnicodeString digitString = groupsMat->group(2, status);
4176                 int32_t t = 0;
4177                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4178                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4179                 int32_t matchPosition;
4180                 if (plusOrMinus.compare("+") == 0) {
4181                     matchPosition = testMat->end(groupNum, status);
4182                 } else {
4183                     matchPosition = testMat->start(groupNum, status);
4184                 }
4185                 if (matchPosition != -1) {
4186                     ICU_Utility::appendNumber(resultString, matchPosition);
4187                 }
4188                 perlExpr.remove(0, groupsMat->end(status));
4189             }
4190
4191             else if (cgMat->lookingAt(status)) {
4192                 // $1, $2, $3, etc.
4193                 UnicodeString digitString = cgMat->group(1, status);
4194                 int32_t t = 0;
4195                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4196                 if (U_SUCCESS(status)) {
4197                     resultString.append(testMat->group(groupNum, status));
4198                     status = U_ZERO_ERROR;
4199                 }
4200                 perlExpr.remove(0, cgMat->end(status));
4201             }
4202
4203             else if (perlExpr.startsWith("@-")) {
4204                 int32_t i;
4205                 for (i=0; i<=testMat->groupCount(); i++) {
4206                     if (i>0) {
4207                         resultString.append(" ");
4208                     }
4209                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4210                 }
4211                 perlExpr.remove(0, 2);
4212             }
4213
4214             else if (perlExpr.startsWith("@+")) {
4215                 int32_t i;
4216                 for (i=0; i<=testMat->groupCount(); i++) {
4217                     if (i>0) {
4218                         resultString.append(" ");
4219                     }
4220                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4221                 }
4222                 perlExpr.remove(0, 2);
4223             }
4224
4225             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4226                                                      //           or as an escaped sequence (e.g. \n)
4227                 if (perlExpr.length() > 1) {
4228                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4229                 }
4230                 UChar c = perlExpr.charAt(0);
4231                 switch (c) {
4232                 case 'n':   c = '\n'; break;
4233                 // add any other escape sequences that show up in the test expected results.
4234                 }
4235                 resultString.append(c);
4236                 perlExpr.remove(0, 1);
4237             }
4238
4239             else  {
4240                 // Any characters from the perl expression that we don't explicitly
4241                 //  recognize before here are assumed to be literals and copied
4242                 //  as-is to the expected results.
4243                 resultString.append(perlExpr.charAt(0));
4244                 perlExpr.remove(0, 1);
4245             }
4246
4247             if (U_FAILURE(status)) {
4248                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4249                 break;
4250             }
4251         }
4252
4253         //
4254         // Expected Results Compare
4255         //
4256         UnicodeString expectedS(fields[4]);
4257         expectedS.findAndReplace(nulnulSrc, nulnul);
4258         expectedS.findAndReplace(ffffSrc,   ffff);
4259         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4260
4261
4262         if (expectedS.compare(resultString) != 0) {
4263             err("Line %d: Incorrect perl expression results.", lineNum);
4264             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4265         }
4266
4267         delete testMat;
4268         delete testPat;
4269     }
4270
4271     //
4272     // All done.  Clean up allocated stuff.
4273     //
4274     delete cgMat;
4275     delete cgPat;
4276
4277     delete groupsMat;
4278     delete groupsPat;
4279
4280     delete flagMat;
4281     delete flagPat;
4282
4283     delete lineMat;
4284     delete linePat;
4285
4286     delete fieldPat;
4287     delete [] testData;
4288
4289
4290     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4291
4292 }
4293
4294
4295 //-------------------------------------------------------------------------------
4296 //
4297 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4298 //                  (instead of using UnicodeStrings) to test the alternate engine.
4299 //                  The input file for this test is re_tests, the standard regular
4300 //                  expression test data distributed with the Perl source code.
4301 //                  See PerlTests() for more information.
4302 //
4303 //-------------------------------------------------------------------------------
4304 void RegexTest::PerlTestsUTF8() {
4305     char tdd[2048];
4306     const char *srcPath;
4307     UErrorCode  status = U_ZERO_ERROR;
4308     UParseError pe;
4309     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4310     UText       patternText = UTEXT_INITIALIZER;
4311     char       *patternChars = NULL;
4312     int32_t     patternLength;
4313     int32_t     patternCapacity = 0;
4314     UText       inputText = UTEXT_INITIALIZER;
4315     char       *inputChars = NULL;
4316     int32_t     inputLength;
4317     int32_t     inputCapacity = 0;
4318
4319     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4320
4321     //
4322     //  Open and read the test data file.
4323     //
4324     srcPath=getPath(tdd, "re_tests.txt");
4325     if(srcPath==NULL) {
4326         return; /* something went wrong, error already output */
4327     }
4328
4329     int32_t    len;
4330     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4331     if (U_FAILURE(status)) {
4332         return; /* something went wrong, error already output */
4333     }
4334
4335     //
4336     //  Put the test data into a UnicodeString
4337     //
4338     UnicodeString testDataString(FALSE, testData, len);
4339
4340     //
4341     //  Regex to break the input file into lines, and strip the new lines.
4342     //     One line per match, capture group one is the desired data.
4343     //
4344     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4345     if (U_FAILURE(status)) {
4346         dataerrln("RegexPattern::compile() error");
4347         return;
4348     }
4349     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4350
4351     //
4352     //  Regex to split a test file line into fields.
4353     //    There are six fields, separated by tabs.
4354     //
4355     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4356
4357     //
4358     //  Regex to identify test patterns with flag settings, and to separate them.
4359     //    Test patterns with flags look like 'pattern'i
4360     //    Test patterns without flags are not quoted:   pattern
4361     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4362     //
4363     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4364     RegexMatcher* flagMat = flagPat->matcher(status);
4365
4366     //
4367     // The Perl tests reference several perl-isms, which are evaluated/substituted
4368     //   in the test data.  Not being perl, this must be done explicitly.  Here
4369     //   are string constants and REs for these constructs.
4370     //
4371     UnicodeString nulnulSrc("${nulnul}");
4372     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4373     nulnul = nulnul.unescape();
4374
4375     UnicodeString ffffSrc("${ffff}");
4376     UnicodeString ffff("\\uffff", -1, US_INV);
4377     ffff = ffff.unescape();
4378
4379     //  regexp for $-[0], $+[2], etc.
4380     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4381     RegexMatcher *groupsMat = groupsPat->matcher(status);
4382
4383     //  regexp for $0, $1, $2, etc.
4384     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4385     RegexMatcher *cgMat = cgPat->matcher(status);
4386
4387
4388     //
4389     // Main Loop for the Perl Tests, runs once per line from the
4390     //   test data file.
4391     //
4392     int32_t  lineNum = 0;
4393     int32_t  skippedUnimplementedCount = 0;
4394     while (lineMat->find()) {
4395         lineNum++;
4396
4397         //
4398         //  Get a line, break it into its fields, do the Perl
4399         //    variable substitutions.
4400         //
4401         UnicodeString line = lineMat->group(1, status);
4402         UnicodeString fields[7];
4403         fieldPat->split(line, fields, 7, status);
4404
4405         flagMat->reset(fields[0]);
4406         flagMat->matches(status);
4407         UnicodeString pattern  = flagMat->group(2, status);
4408         pattern.findAndReplace("${bang}", "!");
4409         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4410         pattern.findAndReplace(ffffSrc, ffff);
4411
4412         //
4413         //  Identify patterns that include match flag settings,
4414         //    split off the flags, remove the extra quotes.
4415         //
4416         UnicodeString flagStr = flagMat->group(3, status);
4417         if (U_FAILURE(status)) {
4418             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4419             return;
4420         }
4421         int32_t flags = 0;
4422         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4423         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4424         const UChar UChar_m = 0x6d;
4425         const UChar UChar_x = 0x78;
4426         const UChar UChar_y = 0x79;
4427         if (flagStr.indexOf(UChar_i) != -1) {
4428             flags |= UREGEX_CASE_INSENSITIVE;
4429         }
4430         if (flagStr.indexOf(UChar_m) != -1) {
4431             flags |= UREGEX_MULTILINE;
4432         }
4433         if (flagStr.indexOf(UChar_x) != -1) {
4434             flags |= UREGEX_COMMENTS;
4435         }
4436
4437         //
4438         // Put the pattern in a UTF-8 UText
4439         //
4440         status = U_ZERO_ERROR;
4441         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4442         if (status == U_BUFFER_OVERFLOW_ERROR) {
4443             status = U_ZERO_ERROR;
4444             delete[] patternChars;
4445             patternCapacity = patternLength + 1;
4446             patternChars = new char[patternCapacity];
4447             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4448         }
4449         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4450
4451         //
4452         // Compile the test pattern.
4453         //
4454         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4455         if (status == U_REGEX_UNIMPLEMENTED) {
4456             //
4457             // Test of a feature that is planned for ICU, but not yet implemented.
4458             //   skip the test.
4459             skippedUnimplementedCount++;
4460             delete testPat;
4461             status = U_ZERO_ERROR;
4462             continue;
4463         }
4464
4465         if (U_FAILURE(status)) {
4466             // Some tests are supposed to generate errors.
4467             //   Only report an error for tests that are supposed to succeed.
4468             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4469                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4470             {
4471                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4472             }
4473             status = U_ZERO_ERROR;
4474             delete testPat;
4475             continue;
4476         }
4477
4478         if (fields[2].indexOf(UChar_i) >= 0) {
4479             // ICU should skip this test.
4480             delete testPat;
4481             continue;
4482         }
4483
4484         if (fields[2].indexOf(UChar_c) >= 0) {
4485             // This pattern should have caused a compilation error, but didn't/
4486             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4487             delete testPat;
4488             continue;
4489         }
4490
4491
4492         //
4493         // replace the Perl variables that appear in some of the
4494         //   match data strings.
4495         //
4496         UnicodeString matchString = fields[1];
4497         matchString.findAndReplace(nulnulSrc, nulnul);
4498         matchString.findAndReplace(ffffSrc,   ffff);
4499
4500         // Replace any \n in the match string with an actual new-line char.
4501         //  Don't do full unescape, as this unescapes more than Perl does, which
4502         //  causes other spurious failures in the tests.
4503         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4504
4505         //
4506         // Put the input in a UTF-8 UText
4507         //
4508         status = U_ZERO_ERROR;
4509         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4510         if (status == U_BUFFER_OVERFLOW_ERROR) {
4511             status = U_ZERO_ERROR;
4512             delete[] inputChars;
4513             inputCapacity = inputLength + 1;
4514             inputChars = new char[inputCapacity];
4515             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4516         }
4517         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4518
4519         //
4520         // Run the test, check for expected match/don't match result.
4521         //
4522         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4523         UBool found = testMat->find();
4524         UBool expected = FALSE;
4525         if (fields[2].indexOf(UChar_y) >=0) {
4526             expected = TRUE;
4527         }
4528         if (expected != found) {
4529             errln("line %d: Expected %smatch, got %smatch",
4530                 lineNum, expected?"":"no ", found?"":"no " );
4531             continue;
4532         }
4533
4534         // Don't try to check expected results if there is no match.
4535         //   (Some have stuff in the expected fields)
4536         if (!found) {
4537             delete testMat;
4538             delete testPat;
4539             continue;
4540         }
4541
4542         //
4543         // Interpret the Perl expression from the fourth field of the data file,
4544         // building up an ICU string from the results of the ICU match.
4545         //   The Perl expression will contain references to the results of
4546         //     a regex match, including the matched string, capture group strings,
4547         //     group starting and ending indicies, etc.
4548         //
4549         UnicodeString resultString;
4550         UnicodeString perlExpr = fields[3];
4551
4552         while (perlExpr.length() > 0) {
4553             groupsMat->reset(perlExpr);
4554             cgMat->reset(perlExpr);
4555
4556             if (perlExpr.startsWith("$&")) {
4557                 resultString.append(testMat->group(status));
4558                 perlExpr.remove(0, 2);
4559             }
4560
4561             else if (groupsMat->lookingAt(status)) {
4562                 // $-[0]   $+[2]  etc.
4563                 UnicodeString digitString = groupsMat->group(2, status);
4564                 int32_t t = 0;
4565                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4566                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4567                 int32_t matchPosition;
4568                 if (plusOrMinus.compare("+") == 0) {
4569                     matchPosition = testMat->end(groupNum, status);
4570                 } else {
4571                     matchPosition = testMat->start(groupNum, status);
4572                 }
4573                 if (matchPosition != -1) {
4574                     ICU_Utility::appendNumber(resultString, matchPosition);
4575                 }
4576                 perlExpr.remove(0, groupsMat->end(status));
4577             }
4578
4579             else if (cgMat->lookingAt(status)) {
4580                 // $1, $2, $3, etc.
4581                 UnicodeString digitString = cgMat->group(1, status);
4582                 int32_t t = 0;
4583                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4584                 if (U_SUCCESS(status)) {
4585                     resultString.append(testMat->group(groupNum, status));
4586                     status = U_ZERO_ERROR;
4587                 }
4588                 perlExpr.remove(0, cgMat->end(status));
4589             }
4590
4591             else if (perlExpr.startsWith("@-")) {
4592                 int32_t i;
4593                 for (i=0; i<=testMat->groupCount(); i++) {
4594                     if (i>0) {
4595                         resultString.append(" ");
4596                     }
4597                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4598                 }
4599                 perlExpr.remove(0, 2);
4600             }
4601
4602             else if (perlExpr.startsWith("@+")) {
4603                 int32_t i;
4604                 for (i=0; i<=testMat->groupCount(); i++) {
4605                     if (i>0) {
4606                         resultString.append(" ");
4607                     }
4608                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4609                 }
4610                 perlExpr.remove(0, 2);
4611             }
4612
4613             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4614                                                      //           or as an escaped sequence (e.g. \n)
4615                 if (perlExpr.length() > 1) {
4616                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4617                 }
4618                 UChar c = perlExpr.charAt(0);
4619                 switch (c) {
4620                 case 'n':   c = '\n'; break;
4621                 // add any other escape sequences that show up in the test expected results.
4622                 }
4623                 resultString.append(c);
4624                 perlExpr.remove(0, 1);
4625             }
4626
4627             else  {
4628                 // Any characters from the perl expression that we don't explicitly
4629                 //  recognize before here are assumed to be literals and copied
4630                 //  as-is to the expected results.
4631                 resultString.append(perlExpr.charAt(0));
4632                 perlExpr.remove(0, 1);
4633             }
4634
4635             if (U_FAILURE(status)) {
4636                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4637                 break;
4638             }
4639         }
4640
4641         //
4642         // Expected Results Compare
4643         //
4644         UnicodeString expectedS(fields[4]);
4645         expectedS.findAndReplace(nulnulSrc, nulnul);
4646         expectedS.findAndReplace(ffffSrc,   ffff);
4647         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4648
4649
4650         if (expectedS.compare(resultString) != 0) {
4651             err("Line %d: Incorrect perl expression results.", lineNum);
4652             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4653         }
4654
4655         delete testMat;
4656         delete testPat;
4657     }
4658
4659     //
4660     // All done.  Clean up allocated stuff.
4661     //
4662     delete cgMat;
4663     delete cgPat;
4664
4665     delete groupsMat;
4666     delete groupsPat;
4667
4668     delete flagMat;
4669     delete flagPat;
4670
4671     delete lineMat;
4672     delete linePat;
4673
4674     delete fieldPat;
4675     delete [] testData;
4676
4677     utext_close(&patternText);
4678     utext_close(&inputText);
4679
4680     delete [] patternChars;
4681     delete [] inputChars;
4682
4683
4684     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4685
4686 }
4687
4688
4689 //--------------------------------------------------------------
4690 //
4691 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4692 //             Use this pattern,
4693 //                 "(a?){1,8000000}"
4694 //             Note: this was originally an unbounded upper bound, but that case now has loop-breaking enabled.
4695 //                   This test is likely to be fragile, as further optimizations stop
4696 //                   more cases of pointless looping in the match engine.
4697 //
4698 //---------------------------------------------------------------
4699 void RegexTest::Bug6149() {
4700     UnicodeString pattern("(a?){1,8000000}");
4701     UnicodeString s("xyz");
4702     uint32_t flags = 0;
4703     UErrorCode status = U_ZERO_ERROR;
4704
4705     RegexMatcher  matcher(pattern, s, flags, status);
4706     UBool result = false;
4707     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4708     REGEX_ASSERT(result == FALSE);
4709  }
4710
4711
4712 //
4713 //   Callbacks()    Test the callback function.
4714 //                  When set, callbacks occur periodically during matching operations,
4715 //                  giving the application code the ability to abort the operation
4716 //                  before its normal completion.
4717 //
4718
4719 struct callBackContext {
4720     RegexTest        *test;
4721     int32_t          maxCalls;
4722     int32_t          numCalls;
4723     int32_t          lastSteps;
4724     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4725 };
4726
4727 U_CDECL_BEGIN
4728 static UBool U_CALLCONV
4729 testCallBackFn(const void *context, int32_t steps) {
4730     callBackContext  *info = (callBackContext *)context;
4731     if (info->lastSteps+1 != steps) {
4732         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4733     }
4734     info->lastSteps = steps;
4735     info->numCalls++;
4736     return (info->numCalls < info->maxCalls);
4737 }
4738 U_CDECL_END
4739
4740 void RegexTest::Callbacks() {
4741    {
4742         // Getter returns NULLs if no callback has been set
4743
4744         //   The variables that the getter will fill in.
4745         //   Init to non-null values so that the action of the getter can be seen.
4746         const void          *returnedContext = &returnedContext;
4747         URegexMatchCallback *returnedFn = &testCallBackFn;
4748
4749         UErrorCode status = U_ZERO_ERROR;
4750         RegexMatcher matcher("x", 0, status);
4751         REGEX_CHECK_STATUS;
4752         matcher.getMatchCallback(returnedFn, returnedContext, status);
4753         REGEX_CHECK_STATUS;
4754         REGEX_ASSERT(returnedFn == NULL);
4755         REGEX_ASSERT(returnedContext == NULL);
4756     }
4757
4758    {
4759         // Set and Get work
4760         callBackContext cbInfo = {this, 0, 0, 0};
4761         const void          *returnedContext;
4762         URegexMatchCallback *returnedFn;
4763         UErrorCode status = U_ZERO_ERROR;
4764         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4765         REGEX_CHECK_STATUS;
4766         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4767         REGEX_CHECK_STATUS;
4768         matcher.getMatchCallback(returnedFn, returnedContext, status);
4769         REGEX_CHECK_STATUS;
4770         REGEX_ASSERT(returnedFn == testCallBackFn);
4771         REGEX_ASSERT(returnedContext == &cbInfo);
4772
4773         // A short-running match shouldn't invoke the callback
4774         status = U_ZERO_ERROR;
4775         cbInfo.reset(1);
4776         UnicodeString s = "xxx";
4777         matcher.reset(s);
4778         REGEX_ASSERT(matcher.matches(status));
4779         REGEX_CHECK_STATUS;
4780         REGEX_ASSERT(cbInfo.numCalls == 0);
4781
4782         // A medium-length match that runs long enough to invoke the
4783         //   callback, but not so long that the callback aborts it.
4784         status = U_ZERO_ERROR;
4785         cbInfo.reset(4);
4786         s = "aaaaaaaaaaaaaaaaaaab";
4787         matcher.reset(s);
4788         REGEX_ASSERT(matcher.matches(status)==FALSE);
4789         REGEX_CHECK_STATUS;
4790         REGEX_ASSERT(cbInfo.numCalls > 0);
4791
4792         // A longer running match that the callback function will abort.
4793         status = U_ZERO_ERROR;
4794         cbInfo.reset(4);
4795         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4796         matcher.reset(s);
4797         REGEX_ASSERT(matcher.matches(status)==FALSE);
4798         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4799         REGEX_ASSERT(cbInfo.numCalls == 4);
4800     }
4801
4802
4803 }
4804
4805
4806 //
4807 //   FindProgressCallbacks()    Test the find "progress" callback function.
4808 //                  When set, the find progress callback will be invoked during find operations
4809 //                  after each return from a match attempt, giving the application the opportunity
4810 //                  to terminate a long-running find operation before its normal completion.
4811 //
4812
4813 struct progressCallBackContext {
4814     RegexTest        *test;
4815     int64_t          lastIndex;
4816     int32_t          maxCalls;
4817     int32_t          numCalls;
4818     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4819 };
4820
4821 U_CDECL_BEGIN
4822 static UBool U_CALLCONV
4823 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4824     progressCallBackContext  *info = (progressCallBackContext *)context;
4825     info->numCalls++;
4826     info->lastIndex = matchIndex;
4827 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4828     return (info->numCalls < info->maxCalls);
4829 }
4830 U_CDECL_END
4831
4832 void RegexTest::FindProgressCallbacks() {
4833    {
4834         // Getter returns NULLs if no callback has been set
4835
4836         //   The variables that the getter will fill in.
4837         //   Init to non-null values so that the action of the getter can be seen.
4838         const void                  *returnedContext = &returnedContext;
4839         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4840
4841         UErrorCode status = U_ZERO_ERROR;
4842         RegexMatcher matcher("x", 0, status);
4843         REGEX_CHECK_STATUS;
4844         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4845         REGEX_CHECK_STATUS;
4846         REGEX_ASSERT(returnedFn == NULL);
4847         REGEX_ASSERT(returnedContext == NULL);
4848     }
4849
4850    {
4851         // Set and Get work
4852         progressCallBackContext cbInfo = {this, 0, 0, 0};
4853         const void                  *returnedContext;
4854         URegexFindProgressCallback  *returnedFn;
4855         UErrorCode status = U_ZERO_ERROR;
4856         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4857         REGEX_CHECK_STATUS;
4858         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4859         REGEX_CHECK_STATUS;
4860         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4861         REGEX_CHECK_STATUS;
4862         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4863         REGEX_ASSERT(returnedContext == &cbInfo);
4864
4865         // A short-running match should NOT invoke the callback.
4866         status = U_ZERO_ERROR;
4867         cbInfo.reset(100);
4868         UnicodeString s = "abxxx";
4869         matcher.reset(s);
4870 #if 0
4871         matcher.setTrace(TRUE);
4872 #endif
4873         REGEX_ASSERT(matcher.find(0, status));
4874         REGEX_CHECK_STATUS;
4875         REGEX_ASSERT(cbInfo.numCalls == 0);
4876
4877         // A medium running match that causes matcher.find() to invoke our callback for each index.
4878         status = U_ZERO_ERROR;
4879         s = "aaaaaaaaaaaaaaaaaaab";
4880         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4881         matcher.reset(s);
4882         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4883         REGEX_CHECK_STATUS;
4884         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4885
4886         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4887         status = U_ZERO_ERROR;
4888         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4889         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4890         matcher.reset(s1);
4891         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4892         REGEX_CHECK_STATUS;
4893         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4894
4895 #if 0
4896         // Now a match that will succeed, but after an interruption
4897         status = U_ZERO_ERROR;
4898         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4899         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4900         matcher.reset(s2);
4901         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4902         REGEX_CHECK_STATUS;
4903         // Now retry the match from where left off
4904         cbInfo.maxCalls = 100; //  No callback limit
4905         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4906         REGEX_CHECK_STATUS;
4907 #endif
4908     }
4909
4910
4911 }
4912
4913
4914 //---------------------------------------------------------------------------
4915 //
4916 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4917 //                             UTexts. The pure-C implementation of UText
4918 //                             has no mutable backing stores, but we can
4919 //                             use UnicodeString here to test the functionality.
4920 //
4921 //---------------------------------------------------------------------------
4922 void RegexTest::PreAllocatedUTextCAPI () {
4923     UErrorCode           status = U_ZERO_ERROR;
4924     URegularExpression  *re;
4925     UText                patternText = UTEXT_INITIALIZER;
4926     UnicodeString        buffer;
4927     UText                bufferText = UTEXT_INITIALIZER;
4928
4929     utext_openUnicodeString(&bufferText, &buffer, &status);
4930
4931     /*
4932      *  getText() and getUText()
4933      */
4934     {
4935         UText  text1 = UTEXT_INITIALIZER;
4936         UText  text2 = UTEXT_INITIALIZER;
4937         UChar  text2Chars[20];
4938         UText  *resultText;
4939
4940         status = U_ZERO_ERROR;
4941         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4942         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4943         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4944         utext_openUChars(&text2, text2Chars, -1, &status);
4945
4946         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4947         re = uregex_openUText(&patternText, 0, NULL, &status);
4948
4949         /* First set a UText */
4950         uregex_setUText(re, &text1, &status);
4951         resultText = uregex_getUText(re, &bufferText, &status);
4952         REGEX_CHECK_STATUS;
4953         REGEX_ASSERT(resultText == &bufferText);
4954         utext_setNativeIndex(resultText, 0);
4955         utext_setNativeIndex(&text1, 0);
4956         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4957
4958         resultText = uregex_getUText(re, &bufferText, &status);
4959         REGEX_CHECK_STATUS;
4960         REGEX_ASSERT(resultText == &bufferText);
4961         utext_setNativeIndex(resultText, 0);
4962         utext_setNativeIndex(&text1, 0);
4963         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4964
4965         /* Then set a UChar * */
4966         uregex_setText(re, text2Chars, 7, &status);
4967         resultText = uregex_getUText(re, &bufferText, &status);
4968         REGEX_CHECK_STATUS;
4969         REGEX_ASSERT(resultText == &bufferText);
4970         utext_setNativeIndex(resultText, 0);
4971         utext_setNativeIndex(&text2, 0);
4972         REGEX_ASSERT(testUTextEqual(resultText, &text2));
4973
4974         uregex_close(re);
4975         utext_close(&text1);
4976         utext_close(&text2);
4977     }
4978
4979     /*
4980      *  group()
4981      */
4982     {
4983         UChar    text1[80];
4984         UText   *actual;
4985         UBool    result;
4986         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4987
4988         status = U_ZERO_ERROR;
4989         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4990         REGEX_CHECK_STATUS;
4991
4992         uregex_setText(re, text1, -1, &status);
4993         result = uregex_find(re, 0, &status);
4994         REGEX_ASSERT(result==TRUE);
4995
4996         /*  Capture Group 0, the full match.  Should succeed.  */
4997         status = U_ZERO_ERROR;
4998         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4999         REGEX_CHECK_STATUS;
5000         REGEX_ASSERT(actual == &bufferText);
5001         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
5002
5003         /*  Capture group #1.  Should succeed. */
5004         status = U_ZERO_ERROR;
5005         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
5006         REGEX_CHECK_STATUS;
5007         REGEX_ASSERT(actual == &bufferText);
5008         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5009
5010         /*  Capture group out of range.  Error. */
5011         status = U_ZERO_ERROR;
5012         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5013         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5014         REGEX_ASSERT(actual == &bufferText);
5015
5016         uregex_close(re);
5017
5018     }
5019
5020     /*
5021      *  replaceFirst()
5022      */
5023     {
5024         UChar    text1[80];
5025         UChar    text2[80];
5026         UText    replText = UTEXT_INITIALIZER;
5027         UText   *result;
5028
5029         status = U_ZERO_ERROR;
5030         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5031         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5032         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5033
5034         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5035         REGEX_CHECK_STATUS;
5036
5037         /*  Normal case, with match */
5038         uregex_setText(re, text1, -1, &status);
5039         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5040         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5041         REGEX_CHECK_STATUS;
5042         REGEX_ASSERT(result == &bufferText);
5043         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5044
5045         /* No match.  Text should copy to output with no changes.  */
5046         uregex_setText(re, text2, -1, &status);
5047         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5048         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5049         REGEX_CHECK_STATUS;
5050         REGEX_ASSERT(result == &bufferText);
5051         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5052
5053         /* Unicode escapes */
5054         uregex_setText(re, text1, -1, &status);
5055         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5056         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5057         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5058         REGEX_CHECK_STATUS;
5059         REGEX_ASSERT(result == &bufferText);
5060         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5061
5062         uregex_close(re);
5063         utext_close(&replText);
5064     }
5065
5066
5067     /*
5068      *  replaceAll()
5069      */
5070     {
5071         UChar    text1[80];
5072         UChar    text2[80];
5073         UText    replText = UTEXT_INITIALIZER;
5074         UText   *result;
5075
5076         status = U_ZERO_ERROR;
5077         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5078         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5079         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5080
5081         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5082         REGEX_CHECK_STATUS;
5083
5084         /*  Normal case, with match */
5085         uregex_setText(re, text1, -1, &status);
5086         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5087         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5088         REGEX_CHECK_STATUS;
5089         REGEX_ASSERT(result == &bufferText);
5090         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5091
5092         /* No match.  Text should copy to output with no changes.  */
5093         uregex_setText(re, text2, -1, &status);
5094         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5095         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5096         REGEX_CHECK_STATUS;
5097         REGEX_ASSERT(result == &bufferText);
5098         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5099
5100         uregex_close(re);
5101         utext_close(&replText);
5102     }
5103
5104
5105     /*
5106      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5107      *   so we don't need to test it here.
5108      */
5109
5110     utext_close(&bufferText);
5111     utext_close(&patternText);
5112 }
5113
5114 //--------------------------------------------------------------
5115 //
5116 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5117 //
5118 //---------------------------------------------------------------
5119 void RegexTest::Bug7651() {
5120     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5121     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5122     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5123     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5124     UnicodeString s("#ff @abcd This is test");
5125     RegexPattern  *REPattern = NULL;
5126     RegexMatcher  *REMatcher = NULL;
5127     UErrorCode status = U_ZERO_ERROR;
5128     UParseError pe;
5129
5130     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5131     REGEX_CHECK_STATUS;
5132     REMatcher = REPattern->matcher(s, status);
5133     REGEX_CHECK_STATUS;
5134     REGEX_ASSERT(REMatcher->find());
5135     REGEX_ASSERT(REMatcher->start(status) == 0);
5136     delete REPattern;
5137     delete REMatcher;
5138     status = U_ZERO_ERROR;
5139
5140     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5141     REGEX_CHECK_STATUS;
5142     REMatcher = REPattern->matcher(s, status);
5143     REGEX_CHECK_STATUS;
5144     REGEX_ASSERT(REMatcher->find());
5145     REGEX_ASSERT(REMatcher->start(status) == 0);
5146     delete REPattern;
5147     delete REMatcher;
5148     status = U_ZERO_ERROR;
5149  }
5150
5151 void RegexTest::Bug7740() {
5152     UErrorCode status = U_ZERO_ERROR;
5153     UnicodeString pattern = "(a)";
5154     UnicodeString text = "abcdef";
5155     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5156     REGEX_CHECK_STATUS;
5157     REGEX_ASSERT(m->lookingAt(status));
5158     REGEX_CHECK_STATUS;
5159     status = U_ILLEGAL_ARGUMENT_ERROR;
5160     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5161     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5162     REGEX_ASSERT(s == "");
5163     delete m;
5164 }
5165
5166 // Bug 8479:  was crashing with a Bogus UnicodeString as input.
5167
5168 void RegexTest::Bug8479() {
5169     UErrorCode status = U_ZERO_ERROR;
5170
5171     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5172     REGEX_CHECK_STATUS;
5173     if (U_SUCCESS(status))
5174     {
5175         UnicodeString str;
5176         str.setToBogus();
5177         pMatcher->reset(str);
5178         status = U_ZERO_ERROR;
5179         pMatcher->matches(status);
5180         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5181         delete pMatcher;
5182     }
5183 }
5184
5185
5186 // Bug 7029
5187 void RegexTest::Bug7029() {
5188     UErrorCode status = U_ZERO_ERROR;
5189
5190     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5191     UnicodeString text = "abc.def";
5192     UnicodeString splits[10];
5193     REGEX_CHECK_STATUS;
5194     int32_t numFields = pMatcher->split(text, splits, 10, status);
5195     REGEX_CHECK_STATUS;
5196     REGEX_ASSERT(numFields == 8);
5197     delete pMatcher;
5198 }
5199
5200 // Bug 9283
5201 //   This test is checking for the existence of any supplemental characters that case-fold
5202 //   to a bmp character.
5203 //
5204 //   At the time of this writing there are none. If any should appear in a subsequent release
5205 //   of Unicode, the code in regular expressions compilation that determines the longest
5206 //   possible match for a literal string  will need to be enhanced.
5207 //
5208 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5209 //   for details on what to do in case of a failure of this test.
5210 //
5211 void RegexTest::Bug9283() {
5212 #if !UCONFIG_NO_NORMALIZATION
5213     UErrorCode status = U_ZERO_ERROR;
5214     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5215     REGEX_CHECK_STATUS;
5216     int32_t index;
5217     UChar32 c;
5218     for (index=0; ; index++) {
5219         c = supplementalsWithCaseFolding.charAt(index);
5220         if (c == -1) {
5221             break;
5222         }
5223         UnicodeString cf = UnicodeString(c).foldCase();
5224         REGEX_ASSERT(cf.length() >= 2);
5225     }
5226 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5227 }
5228
5229
5230 void RegexTest::CheckInvBufSize() {
5231   if(inv_next>=INV_BUFSIZ) {
5232     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5233           __FILE__, INV_BUFSIZ, inv_next);
5234   } else {
5235     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5236   }
5237 }
5238
5239
5240 void RegexTest::Bug10459() {
5241     UErrorCode status = U_ZERO_ERROR;
5242     UnicodeString patternString("(txt)");
5243     UnicodeString txtString("txt");
5244
5245     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5246     REGEX_CHECK_STATUS;
5247     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5248     REGEX_CHECK_STATUS;
5249
5250     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5251     REGEX_CHECK_STATUS;
5252
5253     uregex_setUText(icu_re, utext_txt, &status);
5254     REGEX_CHECK_STATUS;
5255
5256     // The bug was that calling uregex_group() before doing a matching operation
5257     //   was causing a segfault. Only for Regular Expressions created from UText.
5258     //   It should set an U_REGEX_INVALID_STATE.
5259
5260     UChar buf[100];
5261     int32_t len = uregex_group(icu_re, 0, buf, LENGTHOF(buf), &status);
5262     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5263     REGEX_ASSERT(len == 0);
5264
5265     uregex_close(icu_re);
5266     utext_close(utext_pat);
5267     utext_close(utext_txt);
5268 }
5269
5270 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5271