icuSources/test/intltest/regextst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2015, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 //
   8 //   regextst.cpp
   9 //
  10 //      ICU Regular Expressions test, part of intltest.
  11 //
  12
  13 /*
  14      NOTE!!
  15
  16      PLEASE be careful about ASCII assumptions in this test.
  17      This test is one of the worst repeat offenders.
  18      If you have questions, contact someone on the ICU PMC
  19      who has access to an EBCDIC system.
  20
  21  */
  22
  23 #include "intltest.h"
  24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  25
  26 #include "unicode/localpointer.h"
  27 #include "unicode/regex.h"
  28 #include "unicode/uchar.h"
  29 #include "unicode/ucnv.h"
  30 #include "unicode/uniset.h"
  31 #include "unicode/uregex.h"
  32 #include "unicode/usetiter.h"
  33 #include "unicode/ustring.h"
  34 #include "regextst.h"
  35 #include "regexcmp.h"
  36 #include "uvector.h"
  37 #include "util.h"
  38 #include <stdlib.h>
  39 #include <string.h>
  40 #include <stdio.h>
  41 #include "cmemory.h"
  42 #include "cstring.h"
  43 #include "uinvchar.h"
  44
  45 #define SUPPORT_MUTATING_INPUT_STRING   0
  46
  47 //---------------------------------------------------------------------------
  48 //
  49 //  Test class boilerplate
  50 //
  51 //---------------------------------------------------------------------------
  52 RegexTest::RegexTest()
  53 {
  54 }
  55
  56
  57 RegexTest::~RegexTest()
  58 {
  59 }
  60
  61
  62
  63 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  64 {
  65     if (exec) logln("TestSuite RegexTest: ");
  66     switch (index) {
  67
  68         case 0: name = "Basic";
  69             if (exec) Basic();
  70             break;
  71         case 1: name = "API_Match";
  72             if (exec) API_Match();
  73             break;
  74         case 2: name = "API_Replace";
  75             if (exec) API_Replace();
  76             break;
  77         case 3: name = "API_Pattern";
  78             if (exec) API_Pattern();
  79             break;
  80         case 4:
  81 #if !UCONFIG_NO_FILE_IO
  82             name = "Extended";
  83             if (exec) Extended();
  84 #else
  85             name = "skip";
  86 #endif
  87             break;
  88         case 5: name = "Errors";
  89             if (exec) Errors();
  90             break;
  91         case 6: name = "PerlTests";
  92             if (exec) PerlTests();
  93             break;
  94         case 7: name = "Callbacks";
  95             if (exec) Callbacks();
  96             break;
  97         case 8: name = "FindProgressCallbacks";
  98             if (exec) FindProgressCallbacks();
  99             break;
 100         case 9: name = "Bug 6149";
 101              if (exec) Bug6149();
 102              break;
 103         case 10: name = "UTextBasic";
 104           if (exec) UTextBasic();
 105           break;
 106         case 11: name = "API_Match_UTF8";
 107           if (exec) API_Match_UTF8();
 108           break;
 109         case 12: name = "API_Replace_UTF8";
 110           if (exec) API_Replace_UTF8();
 111           break;
 112         case 13: name = "API_Pattern_UTF8";
 113           if (exec) API_Pattern_UTF8();
 114           break;
 115         case 14: name = "PerlTestsUTF8";
 116           if (exec) PerlTestsUTF8();
 117           break;
 118         case 15: name = "PreAllocatedUTextCAPI";
 119           if (exec) PreAllocatedUTextCAPI();
 120           break;
 121         case 16: name = "Bug 7651";
 122              if (exec) Bug7651();
 123              break;
 124         case 17: name = "Bug 7740";
 125             if (exec) Bug7740();
 126             break;
 127         case 18: name = "Bug 8479";
 128             if (exec) Bug8479();
 129             break;
 130         case 19: name = "Bug 7029";
 131             if (exec) Bug7029();
 132             break;
 133         case 20: name = "CheckInvBufSize";
 134             if (exec) CheckInvBufSize();
 135             break;
 136         case 21: name = "Bug 9283";
 137             if (exec) Bug9283();
 138             break;
 139         case 22: name = "Bug10459";
 140             if (exec) Bug10459();
 141             break;
 142         case 23: name = "TestCaseInsensitiveStarters";
 143             if (exec) TestCaseInsensitiveStarters();
 144             break;
 145         case 24: name = "TestBug11049";
 146             if (exec) TestBug11049();
 147             break;
 148         case 25: name = "TestBug11371";
 149             if (exec) TestBug11371();
 150             break;
 151         case 26: name = "TestBug11480";
 152             if (exec) TestBug11480();
 153             break;
 154         case 27: name = "NamedCapture";
 155             if (exec) NamedCapture();
 156             break;
 157         case 28: name = "NamedCaptureLimits";
 158             if (exec) NamedCaptureLimits();
 159             break;
 160         default: name = "";
 161             break; //needed to end loop
 162     }
 163 }
 164
 165
 166
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 *
 * Forward declaration only: the definition appears further down in this file, after the
 * assertion helpers that use it.
 *
 * @param ut     UText to (re)open, passed through to utext_openUTF8
 * @param inv    invariant-character text in the compilation codepage
 * @param length length of inv, or -1 to have it measured with strlen
 * @param status ICU error code, passed through to utext_openUTF8
 * @return the value returned by utext_openUTF8, or NULL when the conversion buffer is exhausted
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 173
 174 //---------------------------------------------------------------------------
 175 //
 176 //   Error Checking / Reporting macros used in all of the tests.
 177 //
 178 //---------------------------------------------------------------------------
 179
 180 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 181   int64_t oldIndex = utext_getNativeIndex(text);
 182   utext_setNativeIndex(text, 0);
 183   char *bufPtr = buf;
 184   UChar32 c = utext_next32From(text, 0);
 185   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 186     if (0x000020<=c && c<0x00007e) {
 187       *bufPtr = c;
 188     } else {
 189 #if 0
 190       sprintf(bufPtr,"U+%04X", c);
 191       bufPtr+= strlen(bufPtr)-1;
 192 #else
 193       *bufPtr = '%';
 194 #endif
 195     }
 196     bufPtr++;
 197     c = UTEXT_NEXT32(text);
 198   }
 199   *bufPtr = 0;
 200 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 201   char *ebuf = (char*)malloc(bufLen);
 202   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 203   uprv_strncpy(buf, ebuf, bufLen);
 204   free((void*)ebuf);
 205 #endif
 206   utext_setNativeIndex(text, oldIndex);
 207 }
 208
 209
 210 static char ASSERT_BUF[1024];
 211
 212 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 213   if(message.length()==0) {
 214     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 215   } else {
 216     UnicodeString buf;
 217     IntlTest::prettify(message,buf);
 218     if(buf.length()==0) {
 219       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 220     } else {
 221       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 222       if(ASSERT_BUF[0]==0) {
 223         ASSERT_BUF[0]=0;
 224         for(int32_t i=0;i<buf.length();i++) {
 225           UChar ch = buf[i];
 226           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 227         }
 228       }
 229     }
 230   }
 231   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 232   return ASSERT_BUF;
 233 }
 234
// Logs a printable rendering (at most 199 chars) of a UText, labelled with the
// spelling of the argument expression and the call site.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// Reports a data error and RETURNS from the enclosing (void) function when the
// in-scope variable `status` holds a failure code.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports an error when expr evaluates to FALSE.  Does not return, so the test continues.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates expr with a fresh local `status` (shadowing any outer one) and reports
// an error unless that local ends up equal to errcode.  expr is expected to pass
// `status` to the call under test.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS but also reports a caller-supplied line number, and does NOT return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT but also reports a caller-supplied line number, and RETURNS on failure.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Reports an error when `actual` differs from `expected`.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
 258
 259
 260 static UBool testUTextEqual(UText *uta, UText *utb) {
 261     UChar32 ca = 0;
 262     UChar32 cb = 0;
 263     utext_setNativeIndex(uta, 0);
 264     utext_setNativeIndex(utb, 0);
 265     do {
 266         ca = utext_next32(uta);
 267         cb = utext_next32(utb);
 268         if (ca != cb) {
 269             break;
 270         }
 271     } while (ca != U_SENTINEL);
 272     return ca == cb;
 273 }
 274
 275
 276 /**
 277  * @param expected expected text in UTF-8 (not platform) codepage
 278  */
 279 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 280     UErrorCode status = U_ZERO_ERROR;
 281     UText expectedText = UTEXT_INITIALIZER;
 282     utext_openUTF8(&expectedText, expected, -1, &status);
 283     if(U_FAILURE(status)) {
 284       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 285       return;
 286     }
 287     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 288       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 289       return;
 290     }
 291     utext_setNativeIndex(actual, 0);
 292     if (!testUTextEqual(&expectedText, actual)) {
 293         char buf[201 /*21*/];
 294         char expectedBuf[201];
 295         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 296         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 297         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 298     }
 299     utext_close(&expectedText);
 300 }
 301 /**
 302  * @param expected invariant (platform local text) input
 303  */
 304
 305 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 306     UErrorCode status = U_ZERO_ERROR;
 307     UText expectedText = UTEXT_INITIALIZER;
 308     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 309     if(U_FAILURE(status)) {
 310       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 311       return;
 312     }
 313     utext_setNativeIndex(actual, 0);
 314     if (!testUTextEqual(&expectedText, actual)) {
 315         char buf[201 /*21*/];
 316         char expectedBuf[201];
 317         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 318         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 319         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 320     }
 321     utext_close(&expectedText);
 322 }
 323
 324 /**
 325  * Assumes utf-8 input
 326  */
 327 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
 328 /**
 329  * Assumes Invariant input
 330  */
 331 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 332
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 */

#define INV_BUFSIZ 2048 /* increase this if too small */

// Running total of the bytes handed to regextst_openUTF8FromInvariant.  It is
// advanced on every platform, and on non-ASCII platforms it doubles as the
// offset of the next free byte in inv_buf.  Nothing in this part of the file
// resets it.
static int64_t inv_next=0;

// Backing store for the converted strings; only defined when the platform
// charset family is not ASCII.
#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
static char inv_buf[INV_BUFSIZ];
#endif
 346
 347 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 348   if(length==-1) length=strlen(inv);
 349 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 350   inv_next+=length;
 351   return utext_openUTF8(ut, inv, length, status);
 352 #else
 353   if(inv_next+length+1>INV_BUFSIZ) {
 354     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 355             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 356     *status = U_MEMORY_ALLOCATION_ERROR;
 357     return NULL;
 358   }
 359
 360   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 361   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 362   inv_next+=length;
 363
 364 #if 0
 365   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 366 #endif
 367
 368   return utext_openUTF8(ut, (const char*)buf, length, status);
 369 #endif
 370 }
 371
 372
 373 //---------------------------------------------------------------------------
 374 //
 375 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 376 //                       for the LookingAt() and  Match() functions.
 377 //
 378 //       usage:
 379 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 380 //
 381 //          The expected results are UBool - TRUE or FALSE.
 382 //          The input text is unescaped.  The pattern is not.
 383 //
 384 //
 385 //---------------------------------------------------------------------------
 386
 387 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 388
 389 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 390     const UnicodeString pattern(pat, -1, US_INV);
 391     const UnicodeString inputText(text, -1, US_INV);
 392     UErrorCode          status  = U_ZERO_ERROR;
 393     UParseError         pe;
 394     RegexPattern        *REPattern = NULL;
 395     RegexMatcher        *REMatcher = NULL;
 396     UBool               retVal     = TRUE;
 397
 398     UnicodeString patString(pat, -1, US_INV);
 399     REPattern = RegexPattern::compile(patString, 0, pe, status);
 400     if (U_FAILURE(status)) {
 401         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 402             line, u_errorName(status));
 403         return FALSE;
 404     }
 405     if (line==376) { REPattern->dumpPattern();}
 406
 407     UnicodeString inputString(inputText);
 408     UnicodeString unEscapedInput = inputString.unescape();
 409     REMatcher = REPattern->matcher(unEscapedInput, status);
 410     if (U_FAILURE(status)) {
 411         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 412             line, u_errorName(status));
 413         return FALSE;
 414     }
 415
 416     UBool actualmatch;
 417     actualmatch = REMatcher->lookingAt(status);
 418     if (U_FAILURE(status)) {
 419         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 420             line, u_errorName(status));
 421         retVal =  FALSE;
 422     }
 423     if (actualmatch != looking) {
 424         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 425         retVal = FALSE;
 426     }
 427
 428     status = U_ZERO_ERROR;
 429     actualmatch = REMatcher->matches(status);
 430     if (U_FAILURE(status)) {
 431         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 432             line, u_errorName(status));
 433         retVal = FALSE;
 434     }
 435     if (actualmatch != match) {
 436         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 437         retVal = FALSE;
 438     }
 439
 440     if (retVal == FALSE) {
 441         REPattern->dumpPattern();
 442     }
 443
 444     delete REPattern;
 445     delete REMatcher;
 446     return retVal;
 447 }
 448
 449
 450 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 451     UText               pattern    = UTEXT_INITIALIZER;
 452     int32_t             inputUTF8Length;
 453     char                *textChars = NULL;
 454     UText               inputText  = UTEXT_INITIALIZER;
 455     UErrorCode          status     = U_ZERO_ERROR;
 456     UParseError         pe;
 457     RegexPattern        *REPattern = NULL;
 458     RegexMatcher        *REMatcher = NULL;
 459     UBool               retVal     = TRUE;
 460
 461     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 462     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 463     if (U_FAILURE(status)) {
 464         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 465             line, u_errorName(status));
 466         return FALSE;
 467     }
 468
 469     UnicodeString inputString(text, -1, US_INV);
 470     UnicodeString unEscapedInput = inputString.unescape();
 471     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 472     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 473
 474     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 475     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 476         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 477         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 478         return TRUE; // not a failure of the Regex engine
 479     }
 480     status = U_ZERO_ERROR; // buffer overflow
 481     textChars = new char[inputUTF8Length+1];
 482     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 483     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 484
 485     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 486     if (U_FAILURE(status)) {
 487         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 488             line, u_errorName(status));
 489         return FALSE;
 490     }
 491
 492     UBool actualmatch;
 493     actualmatch = REMatcher->lookingAt(status);
 494     if (U_FAILURE(status)) {
 495         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 496             line, u_errorName(status));
 497         retVal =  FALSE;
 498     }
 499     if (actualmatch != looking) {
 500         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 501         retVal = FALSE;
 502     }
 503
 504     status = U_ZERO_ERROR;
 505     actualmatch = REMatcher->matches(status);
 506     if (U_FAILURE(status)) {
 507         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 508             line, u_errorName(status));
 509         retVal = FALSE;
 510     }
 511     if (actualmatch != match) {
 512         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 513         retVal = FALSE;
 514     }
 515
 516     if (retVal == FALSE) {
 517         REPattern->dumpPattern();
 518     }
 519
 520     delete REPattern;
 521     delete REMatcher;
 522     utext_close(&inputText);
 523     utext_close(&pattern);
 524     delete[] textChars;
 525     return retVal;
 526 }
 527
 528
 529
 530 //---------------------------------------------------------------------------
 531 //
 532 //    REGEX_ERR       Macro + invocation function to simplify writing tests
 533 //                       regex tests for incorrect patterns
 534 //
 535 //       usage:
 536 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 537 //
 538 //---------------------------------------------------------------------------
 539 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 540
 541 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 542                           UErrorCode expectedStatus, int32_t line) {
 543     UnicodeString       pattern(pat);
 544
 545     UErrorCode          status         = U_ZERO_ERROR;
 546     UParseError         pe;
 547     RegexPattern        *callerPattern = NULL;
 548
 549     //
 550     //  Compile the caller's pattern
 551     //
 552     UnicodeString patString(pat);
 553     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 554     if (status != expectedStatus) {
 555         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 556     } else {
 557         if (status != U_ZERO_ERROR) {
 558             if (pe.line != errLine || pe.offset != errCol) {
 559                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 560                     line, errLine, errCol, pe.line, pe.offset);
 561             }
 562         }
 563     }
 564
 565     delete callerPattern;
 566
 567     //
 568     //  Compile again, using a UTF-8-based UText
 569     //
 570     UText patternText = UTEXT_INITIALIZER;
 571     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 572     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 573     if (status != expectedStatus) {
 574         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 575     } else {
 576         if (status != U_ZERO_ERROR) {
 577             if (pe.line != errLine || pe.offset != errCol) {
 578                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 579                     line, errLine, errCol, pe.line, pe.offset);
 580             }
 581         }
 582     }
 583
 584     delete callerPattern;
 585     utext_close(&patternText);
 586 }
 587
 588
 589
 590 //---------------------------------------------------------------------------
 591 //
 592 //      Basic      Check for basic functionality of regex pattern matching.
 593 //                 Avoid the use of REGEX_FIND test macro, which has
 594 //                 substantial dependencies on basic Regex functionality.
 595 //
 596 //---------------------------------------------------------------------------
// Table of quick lookingAt()/matches() checks.  Each REGEX_TESTLM line gives a
// pattern, an input, the expected lookingAt() result and the expected matches()
// result; the macro runs the case through both doRegexLMTest and
// doRegexLMTestUTF8 and passes __LINE__ along for failure messages.
// Backslashes are doubled because these are C string literals: the regex
// compiler (for patterns) and unescape() (for inputs) see a single backslash.
void RegexTest::Basic() {


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    // NOTE(review): the next case is identical to the one above.
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
 743
 744
 745 //---------------------------------------------------------------------------
 746 //
 747 //    UTextBasic   Check for quirks that are specific to the UText
 748 //                 implementation.
 749 //
 750 //---------------------------------------------------------------------------
 751 void RegexTest::UTextBasic() {
 752     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 753     UErrorCode status = U_ZERO_ERROR;
 754     UText pattern = UTEXT_INITIALIZER;
 755     utext_openUTF8(&pattern, str_abc, -1, &status);
 756     RegexMatcher matcher(&pattern, 0, status);
 757     REGEX_CHECK_STATUS;
 758
 759     UText input = UTEXT_INITIALIZER;
 760     utext_openUTF8(&input, str_abc, -1, &status);
 761     REGEX_CHECK_STATUS;
 762     matcher.reset(&input);
 763     REGEX_CHECK_STATUS;
 764     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 765
 766     matcher.reset(matcher.inputText());
 767     REGEX_CHECK_STATUS;
 768     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 769
 770     utext_close(&pattern);
 771     utext_close(&input);
 772 }
 773
 774
 775 //---------------------------------------------------------------------------
 776 //
 777 //      API_Match   Test that the API for class RegexMatcher
 778 //                  is present and nominally working, but excluding functions
 779 //                  implementing replace operations.
 780 //
 781 //---------------------------------------------------------------------------
 782 void RegexTest::API_Match() {
 783     UParseError         pe;
 784     UErrorCode          status=U_ZERO_ERROR;
 785     int32_t             flags = 0;
 786
 787     //
 788     // Debug - slide failing test cases early
 789     //
 790 #if 0
 791     {
 792     }
 793     return;
 794 #endif
 795
 796     //
 797     // Simple pattern compilation
 798     //
 799     {
 800         UnicodeString       re("abc");
 801         RegexPattern        *pat2;
 802         pat2 = RegexPattern::compile(re, flags, pe, status);
 803         REGEX_CHECK_STATUS;
 804
 805         UnicodeString inStr1 = "abcdef this is a test";
 806         UnicodeString instr2 = "not abc";
 807         UnicodeString empty  = "";
 808
 809
 810         //
 811         // Matcher creation and reset.
 812         //
 813         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 814         REGEX_CHECK_STATUS;
 815         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 816         REGEX_ASSERT(m1->input() == inStr1);
 817         m1->reset(instr2);
 818         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 819         REGEX_ASSERT(m1->input() == instr2);
 820         m1->reset(inStr1);
 821         REGEX_ASSERT(m1->input() == inStr1);
 822         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 823         m1->reset(empty);
 824         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 825         REGEX_ASSERT(m1->input() == empty);
 826         REGEX_ASSERT(&m1->pattern() == pat2);
 827
 828         //
 829         //  reset(pos, status)
 830         //
 831         m1->reset(inStr1);
 832         m1->reset(4, status);
 833         REGEX_CHECK_STATUS;
 834         REGEX_ASSERT(m1->input() == inStr1);
 835         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 836
 837         m1->reset(-1, status);
 838         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 839         status = U_ZERO_ERROR;
 840
 841         m1->reset(0, status);
 842         REGEX_CHECK_STATUS;
 843         status = U_ZERO_ERROR;
 844
 845         int32_t len = m1->input().length();
 846         m1->reset(len-1, status);
 847         REGEX_CHECK_STATUS;
 848         status = U_ZERO_ERROR;
 849
 850         m1->reset(len, status);
 851         REGEX_CHECK_STATUS;
 852         status = U_ZERO_ERROR;
 853
 854         m1->reset(len+1, status);
 855         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 856         status = U_ZERO_ERROR;
 857
 858         //
 859         // match(pos, status)
 860         //
 861         m1->reset(instr2);
 862         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 863         m1->reset();
 864         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 865         m1->reset();
 866         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 867         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 868         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 869         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 870
 871         // Match() at end of string should fail, but should not
 872         //  be an error.
 873         status = U_ZERO_ERROR;
 874         len = m1->input().length();
 875         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 876         REGEX_CHECK_STATUS;
 877
 878         // Match beyond end of string should fail with an error.
 879         status = U_ZERO_ERROR;
 880         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 881         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 882
 883         // Successful match at end of string.
 884         {
 885             status = U_ZERO_ERROR;
 886             RegexMatcher m("A?", 0, status);  // will match zero length string.
 887             REGEX_CHECK_STATUS;
 888             m.reset(inStr1);
 889             len = inStr1.length();
 890             REGEX_ASSERT(m.matches(len, status) == TRUE);
 891             REGEX_CHECK_STATUS;
 892             m.reset(empty);
 893             REGEX_ASSERT(m.matches(0, status) == TRUE);
 894             REGEX_CHECK_STATUS;
 895         }
 896
 897
 898         //
 899         // lookingAt(pos, status)
 900         //
 901         status = U_ZERO_ERROR;
 902         m1->reset(instr2);  // "not abc"
 903         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 904         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 905         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 906         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 907         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 908         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 909         status = U_ZERO_ERROR;
 910         len = m1->input().length();
 911         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 912         REGEX_CHECK_STATUS;
 913         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 914         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 915
 916         delete m1;
 917         delete pat2;
 918     }
 919
 920
 921     //
 922     // Capture Group.
 923     //     RegexMatcher::start();
 924     //     RegexMatcher::end();
 925     //     RegexMatcher::groupCount();
 926     //
 927     {
 928         int32_t             flags=0;
 929         UParseError         pe;
 930         UErrorCode          status=U_ZERO_ERROR;
 931
 932         UnicodeString       re("01(23(45)67)(.*)");
 933         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 934         REGEX_CHECK_STATUS;
 935         UnicodeString data = "0123456789";
 936
 937         RegexMatcher *matcher = pat->matcher(data, status);
 938         REGEX_CHECK_STATUS;
 939         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 940         static const int32_t matchStarts[] = {0,  2, 4, 8};
 941         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 942         int32_t i;
 943         for (i=0; i<4; i++) {
 944             int32_t actualStart = matcher->start(i, status);
 945             REGEX_CHECK_STATUS;
 946             if (actualStart != matchStarts[i]) {
 947                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 948                     __LINE__, i, matchStarts[i], actualStart);
 949             }
 950             int32_t actualEnd = matcher->end(i, status);
 951             REGEX_CHECK_STATUS;
 952             if (actualEnd != matchEnds[i]) {
 953                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 954                     __LINE__, i, matchEnds[i], actualEnd);
 955             }
 956         }
 957
 958         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 959         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 960
 961         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 962         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 963         matcher->reset();
 964         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 965
 966         matcher->lookingAt(status);
 967         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 968         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 969         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 970         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 971         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 972         REGEX_CHECK_STATUS;
 973         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 974         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 975         matcher->reset();
 976         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 977
 978         delete matcher;
 979         delete pat;
 980
 981     }
 982
 983     //
 984     //  find
 985     //
 986     {
 987         int32_t             flags=0;
 988         UParseError         pe;
 989         UErrorCode          status=U_ZERO_ERROR;
 990
 991         UnicodeString       re("abc");
 992         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 993         REGEX_CHECK_STATUS;
 994         UnicodeString data = ".abc..abc...abc..";
 995         //                    012345678901234567
 996
 997         RegexMatcher *matcher = pat->matcher(data, status);
 998         REGEX_CHECK_STATUS;
 999         REGEX_ASSERT(matcher->find());
1000         REGEX_ASSERT(matcher->start(status) == 1);
1001         REGEX_ASSERT(matcher->find());
1002         REGEX_ASSERT(matcher->start(status) == 6);
1003         REGEX_ASSERT(matcher->find());
1004         REGEX_ASSERT(matcher->start(status) == 12);
1005         REGEX_ASSERT(matcher->find() == FALSE);
1006         REGEX_ASSERT(matcher->find() == FALSE);
1007
1008         matcher->reset();
1009         REGEX_ASSERT(matcher->find());
1010         REGEX_ASSERT(matcher->start(status) == 1);
1011
1012         REGEX_ASSERT(matcher->find(0, status));
1013         REGEX_ASSERT(matcher->start(status) == 1);
1014         REGEX_ASSERT(matcher->find(1, status));
1015         REGEX_ASSERT(matcher->start(status) == 1);
1016         REGEX_ASSERT(matcher->find(2, status));
1017         REGEX_ASSERT(matcher->start(status) == 6);
1018         REGEX_ASSERT(matcher->find(12, status));
1019         REGEX_ASSERT(matcher->start(status) == 12);
1020         REGEX_ASSERT(matcher->find(13, status) == FALSE);
1021         REGEX_ASSERT(matcher->find(16, status) == FALSE);
1022         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1023         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1024
1025         status = U_ZERO_ERROR;
1026         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1027         status = U_ZERO_ERROR;
1028         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1029
1030         REGEX_ASSERT(matcher->groupCount() == 0);
1031
1032         delete matcher;
1033         delete pat;
1034     }
1035
1036
1037     //
1038     //  find, with \G in pattern (true if at the end of a previous match).
1039     //
1040     {
1041         int32_t             flags=0;
1042         UParseError         pe;
1043         UErrorCode          status=U_ZERO_ERROR;
1044
1045         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1046         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1047         REGEX_CHECK_STATUS;
1048         UnicodeString data = ".abcabc.abc..";
1049         //                    012345678901234567
1050
1051         RegexMatcher *matcher = pat->matcher(data, status);
1052         REGEX_CHECK_STATUS;
1053         REGEX_ASSERT(matcher->find());
1054         REGEX_ASSERT(matcher->start(status) == 0);
1055         REGEX_ASSERT(matcher->start(1, status) == -1);
1056         REGEX_ASSERT(matcher->start(2, status) == 1);
1057
1058         REGEX_ASSERT(matcher->find());
1059         REGEX_ASSERT(matcher->start(status) == 4);
1060         REGEX_ASSERT(matcher->start(1, status) == 4);
1061         REGEX_ASSERT(matcher->start(2, status) == -1);
1062         REGEX_CHECK_STATUS;
1063
1064         delete matcher;
1065         delete pat;
1066     }
1067
1068     //
1069     //   find with zero length matches, match position should bump ahead
1070     //     to prevent loops.
1071     //
1072     {
1073         int32_t                 i;
1074         UErrorCode          status=U_ZERO_ERROR;
1075         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1076                                                       //   using an always-true look-ahead.
1077         REGEX_CHECK_STATUS;
1078         UnicodeString s("    ");
1079         m.reset(s);
1080         for (i=0; ; i++) {
1081             if (m.find() == FALSE) {
1082                 break;
1083             }
1084             REGEX_ASSERT(m.start(status) == i);
1085             REGEX_ASSERT(m.end(status) == i);
1086         }
1087         REGEX_ASSERT(i==5);
1088
1089         // Check that the bump goes over surrogate pairs OK
1090         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1091         s = s.unescape();
1092         m.reset(s);
1093         for (i=0; ; i+=2) {
1094             if (m.find() == FALSE) {
1095                 break;
1096             }
1097             REGEX_ASSERT(m.start(status) == i);
1098             REGEX_ASSERT(m.end(status) == i);
1099         }
1100         REGEX_ASSERT(i==10);
1101     }
1102     {
1103         // find() loop breaking test.
1104         //        with pattern of /.?/, should see a series of one char matches, then a single
1105         //        match of zero length at the end of the input string.
1106         int32_t                 i;
1107         UErrorCode          status=U_ZERO_ERROR;
1108         RegexMatcher        m(".?", 0, status);
1109         REGEX_CHECK_STATUS;
1110         UnicodeString s("    ");
1111         m.reset(s);
1112         for (i=0; ; i++) {
1113             if (m.find() == FALSE) {
1114                 break;
1115             }
1116             REGEX_ASSERT(m.start(status) == i);
1117             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1118         }
1119         REGEX_ASSERT(i==5);
1120     }
1121
1122
1123     //
1124     // Matchers with no input string behave as if they had an empty input string.
1125     //
1126
1127     {
1128         UErrorCode status = U_ZERO_ERROR;
1129         RegexMatcher  m(".?", 0, status);
1130         REGEX_CHECK_STATUS;
1131         REGEX_ASSERT(m.find());
1132         REGEX_ASSERT(m.start(status) == 0);
1133         REGEX_ASSERT(m.input() == "");
1134     }
1135     {
1136         UErrorCode status = U_ZERO_ERROR;
1137         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1138         RegexMatcher  *m = p->matcher(status);
1139         REGEX_CHECK_STATUS;
1140
1141         REGEX_ASSERT(m->find() == FALSE);
1142         REGEX_ASSERT(m->input() == "");
1143         delete m;
1144         delete p;
1145     }
1146
1147     //
1148     // Regions
1149     //
1150     {
1151         UErrorCode status = U_ZERO_ERROR;
1152         UnicodeString testString("This is test data");
1153         RegexMatcher m(".*", testString,  0, status);
1154         REGEX_CHECK_STATUS;
1155         REGEX_ASSERT(m.regionStart() == 0);
1156         REGEX_ASSERT(m.regionEnd() == testString.length());
1157         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1158         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1159
1160         m.region(2,4, status);
1161         REGEX_CHECK_STATUS;
1162         REGEX_ASSERT(m.matches(status));
1163         REGEX_ASSERT(m.start(status)==2);
1164         REGEX_ASSERT(m.end(status)==4);
1165         REGEX_CHECK_STATUS;
1166
1167         m.reset();
1168         REGEX_ASSERT(m.regionStart() == 0);
1169         REGEX_ASSERT(m.regionEnd() == testString.length());
1170
1171         UnicodeString shorterString("short");
1172         m.reset(shorterString);
1173         REGEX_ASSERT(m.regionStart() == 0);
1174         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1175
1176         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1177         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1178         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1179         REGEX_ASSERT(&m == &m.reset());
1180         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1181
1182         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1183         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1184         REGEX_ASSERT(&m == &m.reset());
1185         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1186
1187         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1188         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1189         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1190         REGEX_ASSERT(&m == &m.reset());
1191         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1192
1193         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1194         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1195         REGEX_ASSERT(&m == &m.reset());
1196         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1197
1198     }
1199
1200     //
1201     // hitEnd() and requireEnd()
1202     //
1203     {
1204         UErrorCode status = U_ZERO_ERROR;
1205         UnicodeString testString("aabb");
1206         RegexMatcher m1(".*", testString,  0, status);
1207         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1208         REGEX_ASSERT(m1.hitEnd() == TRUE);
1209         REGEX_ASSERT(m1.requireEnd() == FALSE);
1210         REGEX_CHECK_STATUS;
1211
1212         status = U_ZERO_ERROR;
1213         RegexMatcher m2("a*", testString, 0, status);
1214         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1215         REGEX_ASSERT(m2.hitEnd() == FALSE);
1216         REGEX_ASSERT(m2.requireEnd() == FALSE);
1217         REGEX_CHECK_STATUS;
1218
1219         status = U_ZERO_ERROR;
1220         RegexMatcher m3(".*$", testString, 0, status);
1221         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1222         REGEX_ASSERT(m3.hitEnd() == TRUE);
1223         REGEX_ASSERT(m3.requireEnd() == TRUE);
1224         REGEX_CHECK_STATUS;
1225     }
1226
1227
1228     //
1229     // Compilation error on reset with UChar *
1230     //   These were a hazard that people were stumbling over with runtime errors.
1231     //   Changed them to compiler errors by adding private methods that more closely
1232     //   matched the incorrect use of the functions.
1233     //
1234 #if 0
1235     {
1236         UErrorCode status = U_ZERO_ERROR;
1237         UChar ucharString[20];
1238         RegexMatcher m(".", 0, status);
1239         m.reset(ucharString);  // should not compile.
1240
1241         RegexPattern *p = RegexPattern::compile(".", 0, status);
1242         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1243
1244         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1245     }
1246 #endif
1247
1248     //
1249     //  Time Outs.
1250     //       Note:  These tests will need to be changed when the regexp engine is
1251     //              able to detect and cut short the exponential time behavior on
1252     //              this type of match.
1253     //
1254     {
1255         UErrorCode status = U_ZERO_ERROR;
1256         //    Enough 'a's in the string to cause the match to time out.
1257         //       (Each on additonal 'a' doubles the time)
1258         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1259         RegexMatcher matcher("(a+)+b", testString, 0, status);
1260         REGEX_CHECK_STATUS;
1261         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1262         matcher.setTimeLimit(100, status);
1263         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1264         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1265         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1266     }
1267     {
1268         UErrorCode status = U_ZERO_ERROR;
1269         //   Few enough 'a's to slip in under the time limit.
1270         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1271         RegexMatcher matcher("(a+)+b", testString, 0, status);
1272         REGEX_CHECK_STATUS;
1273         matcher.setTimeLimit(100, status);
1274         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1275         REGEX_CHECK_STATUS;
1276     }
1277
1278     //
1279     //  Stack Limits
1280     //
1281     {
1282         UErrorCode status = U_ZERO_ERROR;
1283         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1284
1285         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1286         //   of the '+', and makes the stack frames larger.
1287         RegexMatcher matcher("(A)+A$", testString, 0, status);
1288
1289         // With the default stack, this match should fail to run
1290         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1291         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1292
1293         // With unlimited stack, it should run
1294         status = U_ZERO_ERROR;
1295         matcher.setStackLimit(0, status);
1296         REGEX_CHECK_STATUS;
1297         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1298         REGEX_CHECK_STATUS;
1299         REGEX_ASSERT(matcher.getStackLimit() == 0);
1300
1301         // With a limited stack, it the match should fail
1302         status = U_ZERO_ERROR;
1303         matcher.setStackLimit(10000, status);
1304         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1305         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1306         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1307     }
1308
1309         // A pattern that doesn't save state should work with
1310         //   a minimal sized stack
1311     {
1312         UErrorCode status = U_ZERO_ERROR;
1313         UnicodeString testString = "abc";
1314         RegexMatcher matcher("abc", testString, 0, status);
1315         REGEX_CHECK_STATUS;
1316         matcher.setStackLimit(30, status);
1317         REGEX_CHECK_STATUS;
1318         REGEX_ASSERT(matcher.matches(status) == TRUE);
1319         REGEX_CHECK_STATUS;
1320         REGEX_ASSERT(matcher.getStackLimit() == 30);
1321
1322         // Negative stack sizes should fail
1323         status = U_ZERO_ERROR;
1324         matcher.setStackLimit(1000, status);
1325         REGEX_CHECK_STATUS;
1326         matcher.setStackLimit(-1, status);
1327         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1328         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1329     }
1330
1331
1332 }
1333
1334
1335
1336
1337
1338
1339 //---------------------------------------------------------------------------
1340 //
1341 //      API_Replace        API test for class RegexMatcher, testing the
1342 //                         Replace family of functions.
1343 //
1344 //---------------------------------------------------------------------------
1345 void RegexTest::API_Replace() {
1346     //
1347     //  Replace
1348     //
1349     int32_t             flags=0;
1350     UParseError         pe;
1351     UErrorCode          status=U_ZERO_ERROR;
1352
1353     UnicodeString       re("abc");
1354     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1355     REGEX_CHECK_STATUS;
1356     UnicodeString data = ".abc..abc...abc..";
1357     //                    012345678901234567
1358     RegexMatcher *matcher = pat->matcher(data, status);
1359
1360     //
1361     //  Plain vanilla matches.
1362     //
1363     UnicodeString  dest;
1364     dest = matcher->replaceFirst("yz", status);
1365     REGEX_CHECK_STATUS;
1366     REGEX_ASSERT(dest == ".yz..abc...abc..");
1367
1368     dest = matcher->replaceAll("yz", status);
1369     REGEX_CHECK_STATUS;
1370     REGEX_ASSERT(dest == ".yz..yz...yz..");
1371
1372     //
1373     //  Plain vanilla non-matches.
1374     //
1375     UnicodeString d2 = ".abx..abx...abx..";
1376     matcher->reset(d2);
1377     dest = matcher->replaceFirst("yz", status);
1378     REGEX_CHECK_STATUS;
1379     REGEX_ASSERT(dest == ".abx..abx...abx..");
1380
1381     dest = matcher->replaceAll("yz", status);
1382     REGEX_CHECK_STATUS;
1383     REGEX_ASSERT(dest == ".abx..abx...abx..");
1384
1385     //
1386     // Empty source string
1387     //
1388     UnicodeString d3 = "";
1389     matcher->reset(d3);
1390     dest = matcher->replaceFirst("yz", status);
1391     REGEX_CHECK_STATUS;
1392     REGEX_ASSERT(dest == "");
1393
1394     dest = matcher->replaceAll("yz", status);
1395     REGEX_CHECK_STATUS;
1396     REGEX_ASSERT(dest == "");
1397
1398     //
1399     // Empty substitution string
1400     //
1401     matcher->reset(data);              // ".abc..abc...abc.."
1402     dest = matcher->replaceFirst("", status);
1403     REGEX_CHECK_STATUS;
1404     REGEX_ASSERT(dest == "...abc...abc..");
1405
1406     dest = matcher->replaceAll("", status);
1407     REGEX_CHECK_STATUS;
1408     REGEX_ASSERT(dest == "........");
1409
1410     //
1411     // match whole string
1412     //
1413     UnicodeString d4 = "abc";
1414     matcher->reset(d4);
1415     dest = matcher->replaceFirst("xyz", status);
1416     REGEX_CHECK_STATUS;
1417     REGEX_ASSERT(dest == "xyz");
1418
1419     dest = matcher->replaceAll("xyz", status);
1420     REGEX_CHECK_STATUS;
1421     REGEX_ASSERT(dest == "xyz");
1422
1423     //
1424     // Capture Group, simple case
1425     //
1426     UnicodeString       re2("a(..)");
1427     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1428     REGEX_CHECK_STATUS;
1429     UnicodeString d5 = "abcdefg";
1430     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1431     REGEX_CHECK_STATUS;
1432     dest = matcher2->replaceFirst("$1$1", status);
1433     REGEX_CHECK_STATUS;
1434     REGEX_ASSERT(dest == "bcbcdefg");
1435
1436     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1437     REGEX_CHECK_STATUS;
1438     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1439
1440     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1441     REGEX_ASSERT(U_FAILURE(status));
1442     status = U_ZERO_ERROR;
1443
1444     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1445     replacement = replacement.unescape();
1446     dest = matcher2->replaceFirst(replacement, status);
1447     REGEX_CHECK_STATUS;
1448     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1449
1450     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1451
1452
1453     //
1454     // Replacement String with \u hex escapes
1455     //
1456     {
1457         UnicodeString  src = "abc 1 abc 2 abc 3";
1458         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1459         matcher->reset(src);
1460         UnicodeString  result = matcher->replaceAll(substitute, status);
1461         REGEX_CHECK_STATUS;
1462         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1463     }
1464     {
1465         UnicodeString  src = "abc !";
1466         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1467         matcher->reset(src);
1468         UnicodeString  result = matcher->replaceAll(substitute, status);
1469         REGEX_CHECK_STATUS;
1470         UnicodeString expected = UnicodeString("--");
1471         expected.append((UChar32)0x10000);
1472         expected.append("-- !");
1473         REGEX_ASSERT(result == expected);
1474     }
1475     // TODO:  need more through testing of capture substitutions.
1476
1477     // Bug 4057
1478     //
1479     {
1480         status = U_ZERO_ERROR;
1481         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1482         RegexMatcher m("ss(.*?)ee", 0, status);
1483         REGEX_CHECK_STATUS;
1484         UnicodeString result;
1485
1486         // Multiple finds do NOT bump up the previous appendReplacement postion.
1487         m.reset(s);
1488         m.find();
1489         m.find();
1490         m.appendReplacement(result, "ooh", status);
1491         REGEX_CHECK_STATUS;
1492         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1493
1494         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1495         status = U_ZERO_ERROR;
1496         result.truncate(0);
1497         m.reset(10, status);
1498         m.find();
1499         m.find();
1500         m.appendReplacement(result, "ooh", status);
1501         REGEX_CHECK_STATUS;
1502         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1503
1504         // find() at interior of string, appendReplacemnt still starts at beginning.
1505         status = U_ZERO_ERROR;
1506         result.truncate(0);
1507         m.reset();
1508         m.find(10, status);
1509         m.find();
1510         m.appendReplacement(result, "ooh", status);
1511         REGEX_CHECK_STATUS;
1512         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1513
1514         m.appendTail(result);
1515         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1516
1517     }
1518
1519     delete matcher2;
1520     delete pat2;
1521     delete matcher;
1522     delete pat;
1523 }
1524
1525
1526 //---------------------------------------------------------------------------
1527 //
1528 //      API_Pattern       Test that the API for class RegexPattern is
1529 //                        present and nominally working.
1530 //
1531 //---------------------------------------------------------------------------
1532 void RegexTest::API_Pattern() {
1533     RegexPattern        pata;    // Test default constructor to not crash.
1534     RegexPattern        patb;
1535
1536     REGEX_ASSERT(pata == patb);
1537     REGEX_ASSERT(pata == pata);
1538
1539     UnicodeString re1("abc[a-l][m-z]");
1540     UnicodeString re2("def");
1541     UErrorCode    status = U_ZERO_ERROR;
1542     UParseError   pe;
1543
1544     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1545     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1546     REGEX_CHECK_STATUS;
1547     REGEX_ASSERT(*pat1 == *pat1);
1548     REGEX_ASSERT(*pat1 != pata);
1549
1550     // Assign
1551     patb = *pat1;
1552     REGEX_ASSERT(patb == *pat1);
1553
1554     // Copy Construct
1555     RegexPattern patc(*pat1);
1556     REGEX_ASSERT(patc == *pat1);
1557     REGEX_ASSERT(patb == patc);
1558     REGEX_ASSERT(pat1 != pat2);
1559     patb = *pat2;
1560     REGEX_ASSERT(patb != patc);
1561     REGEX_ASSERT(patb == *pat2);
1562
1563     // Compile with no flags.
1564     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1565     REGEX_ASSERT(*pat1a == *pat1);
1566
1567     REGEX_ASSERT(pat1a->flags() == 0);
1568
1569     // Compile with different flags should be not equal
1570     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1571     REGEX_CHECK_STATUS;
1572
1573     REGEX_ASSERT(*pat1b != *pat1a);
1574     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1575     REGEX_ASSERT(pat1a->flags() == 0);
1576     delete pat1b;
1577
1578     // clone
1579     RegexPattern *pat1c = pat1->clone();
1580     REGEX_ASSERT(*pat1c == *pat1);
1581     REGEX_ASSERT(*pat1c != *pat2);
1582
1583     delete pat1c;
1584     delete pat1a;
1585     delete pat1;
1586     delete pat2;
1587
1588
1589     //
1590     //   Verify that a matcher created from a cloned pattern works.
1591     //     (Jitterbug 3423)
1592     //
1593     {
1594         UErrorCode     status     = U_ZERO_ERROR;
1595         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1596         RegexPattern  *pClone     = pSource->clone();
1597         delete         pSource;
1598         RegexMatcher  *mFromClone = pClone->matcher(status);
1599         REGEX_CHECK_STATUS;
1600         UnicodeString s = "Hello World";
1601         mFromClone->reset(s);
1602         REGEX_ASSERT(mFromClone->find() == TRUE);
1603         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1604         REGEX_ASSERT(mFromClone->find() == TRUE);
1605         REGEX_ASSERT(mFromClone->group(status) == "World");
1606         REGEX_ASSERT(mFromClone->find() == FALSE);
1607         delete mFromClone;
1608         delete pClone;
1609     }
1610
1611     //
1612     //   matches convenience API
1613     //
1614     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1615     REGEX_CHECK_STATUS;
1616     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1617     REGEX_CHECK_STATUS;
1618     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1619     REGEX_CHECK_STATUS;
1620     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1621     REGEX_CHECK_STATUS;
1622     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1623     REGEX_CHECK_STATUS;
1624     status = U_INDEX_OUTOFBOUNDS_ERROR;
1625     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1626     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1627
1628
1629     //
1630     // Split()
1631     //
1632     status = U_ZERO_ERROR;
1633     pat1 = RegexPattern::compile(" +",  pe, status);
1634     REGEX_CHECK_STATUS;
1635     UnicodeString  fields[10];
1636
1637     int32_t n;
1638     n = pat1->split("Now is the time", fields, 10, status);
1639     REGEX_CHECK_STATUS;
1640     REGEX_ASSERT(n==4);
1641     REGEX_ASSERT(fields[0]=="Now");
1642     REGEX_ASSERT(fields[1]=="is");
1643     REGEX_ASSERT(fields[2]=="the");
1644     REGEX_ASSERT(fields[3]=="time");
1645     REGEX_ASSERT(fields[4]=="");
1646
1647     n = pat1->split("Now is the time", fields, 2, status);
1648     REGEX_CHECK_STATUS;
1649     REGEX_ASSERT(n==2);
1650     REGEX_ASSERT(fields[0]=="Now");
1651     REGEX_ASSERT(fields[1]=="is the time");
1652     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1653
1654     fields[1] = "*";
1655     status = U_ZERO_ERROR;
1656     n = pat1->split("Now is the time", fields, 1, status);
1657     REGEX_CHECK_STATUS;
1658     REGEX_ASSERT(n==1);
1659     REGEX_ASSERT(fields[0]=="Now is the time");
1660     REGEX_ASSERT(fields[1]=="*");
1661     status = U_ZERO_ERROR;
1662
1663     n = pat1->split("    Now       is the time   ", fields, 10, status);
1664     REGEX_CHECK_STATUS;
1665     REGEX_ASSERT(n==6);
1666     REGEX_ASSERT(fields[0]=="");
1667     REGEX_ASSERT(fields[1]=="Now");
1668     REGEX_ASSERT(fields[2]=="is");
1669     REGEX_ASSERT(fields[3]=="the");
1670     REGEX_ASSERT(fields[4]=="time");
1671     REGEX_ASSERT(fields[5]=="");
1672
1673     n = pat1->split("     ", fields, 10, status);
1674     REGEX_CHECK_STATUS;
1675     REGEX_ASSERT(n==2);
1676     REGEX_ASSERT(fields[0]=="");
1677     REGEX_ASSERT(fields[1]=="");
1678
1679     fields[0] = "foo";
1680     n = pat1->split("", fields, 10, status);
1681     REGEX_CHECK_STATUS;
1682     REGEX_ASSERT(n==0);
1683     REGEX_ASSERT(fields[0]=="foo");
1684
1685     delete pat1;
1686
1687     //  split, with a pattern with (capture)
1688     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1689     REGEX_CHECK_STATUS;
1690
1691     status = U_ZERO_ERROR;
1692     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1693     REGEX_CHECK_STATUS;
1694     REGEX_ASSERT(n==7);
1695     REGEX_ASSERT(fields[0]=="");
1696     REGEX_ASSERT(fields[1]=="a");
1697     REGEX_ASSERT(fields[2]=="Now is ");
1698     REGEX_ASSERT(fields[3]=="b");
1699     REGEX_ASSERT(fields[4]=="the time");
1700     REGEX_ASSERT(fields[5]=="c");
1701     REGEX_ASSERT(fields[6]=="");
1702     REGEX_ASSERT(status==U_ZERO_ERROR);
1703
1704     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1705     REGEX_CHECK_STATUS;
1706     REGEX_ASSERT(n==7);
1707     REGEX_ASSERT(fields[0]=="  ");
1708     REGEX_ASSERT(fields[1]=="a");
1709     REGEX_ASSERT(fields[2]=="Now is ");
1710     REGEX_ASSERT(fields[3]=="b");
1711     REGEX_ASSERT(fields[4]=="the time");
1712     REGEX_ASSERT(fields[5]=="c");
1713     REGEX_ASSERT(fields[6]=="");
1714
1715     status = U_ZERO_ERROR;
1716     fields[6] = "foo";
1717     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1718     REGEX_CHECK_STATUS;
1719     REGEX_ASSERT(n==6);
1720     REGEX_ASSERT(fields[0]=="  ");
1721     REGEX_ASSERT(fields[1]=="a");
1722     REGEX_ASSERT(fields[2]=="Now is ");
1723     REGEX_ASSERT(fields[3]=="b");
1724     REGEX_ASSERT(fields[4]=="the time");
1725     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1726     REGEX_ASSERT(fields[6]=="foo");
1727
1728     status = U_ZERO_ERROR;
1729     fields[5] = "foo";
1730     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1731     REGEX_CHECK_STATUS;
1732     REGEX_ASSERT(n==5);
1733     REGEX_ASSERT(fields[0]=="  ");
1734     REGEX_ASSERT(fields[1]=="a");
1735     REGEX_ASSERT(fields[2]=="Now is ");
1736     REGEX_ASSERT(fields[3]=="b");
1737     REGEX_ASSERT(fields[4]=="the time<c>");
1738     REGEX_ASSERT(fields[5]=="foo");
1739
1740     status = U_ZERO_ERROR;
1741     fields[5] = "foo";
1742     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1743     REGEX_CHECK_STATUS;
1744     REGEX_ASSERT(n==5);
1745     REGEX_ASSERT(fields[0]=="  ");
1746     REGEX_ASSERT(fields[1]=="a");
1747     REGEX_ASSERT(fields[2]=="Now is ");
1748     REGEX_ASSERT(fields[3]=="b");
1749     REGEX_ASSERT(fields[4]=="the time");
1750     REGEX_ASSERT(fields[5]=="foo");
1751
1752     status = U_ZERO_ERROR;
1753     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1754     REGEX_CHECK_STATUS;
1755     REGEX_ASSERT(n==4);
1756     REGEX_ASSERT(fields[0]=="  ");
1757     REGEX_ASSERT(fields[1]=="a");
1758     REGEX_ASSERT(fields[2]=="Now is ");
1759     REGEX_ASSERT(fields[3]=="the time<c>");
1760     status = U_ZERO_ERROR;
1761     delete pat1;
1762
1763     pat1 = RegexPattern::compile("([-,])",  pe, status);
1764     REGEX_CHECK_STATUS;
1765     n = pat1->split("1-10,20", fields, 10, status);
1766     REGEX_CHECK_STATUS;
1767     REGEX_ASSERT(n==5);
1768     REGEX_ASSERT(fields[0]=="1");
1769     REGEX_ASSERT(fields[1]=="-");
1770     REGEX_ASSERT(fields[2]=="10");
1771     REGEX_ASSERT(fields[3]==",");
1772     REGEX_ASSERT(fields[4]=="20");
1773     delete pat1;
1774
1775     // Test split of string with empty trailing fields
1776     pat1 = RegexPattern::compile(",", pe, status);
1777     REGEX_CHECK_STATUS;
1778     n = pat1->split("a,b,c,", fields, 10, status);
1779     REGEX_CHECK_STATUS;
1780     REGEX_ASSERT(n==4);
1781     REGEX_ASSERT(fields[0]=="a");
1782     REGEX_ASSERT(fields[1]=="b");
1783     REGEX_ASSERT(fields[2]=="c");
1784     REGEX_ASSERT(fields[3]=="");
1785
1786     n = pat1->split("a,,,", fields, 10, status);
1787     REGEX_CHECK_STATUS;
1788     REGEX_ASSERT(n==4);
1789     REGEX_ASSERT(fields[0]=="a");
1790     REGEX_ASSERT(fields[1]=="");
1791     REGEX_ASSERT(fields[2]=="");
1792     REGEX_ASSERT(fields[3]=="");
1793     delete pat1;
1794
1795     // Split Separator with zero length match.
1796     pat1 = RegexPattern::compile(":?", pe, status);
1797     REGEX_CHECK_STATUS;
1798     n = pat1->split("abc", fields, 10, status);
1799     REGEX_CHECK_STATUS;
1800     REGEX_ASSERT(n==5);
1801     REGEX_ASSERT(fields[0]=="");
1802     REGEX_ASSERT(fields[1]=="a");
1803     REGEX_ASSERT(fields[2]=="b");
1804     REGEX_ASSERT(fields[3]=="c");
1805     REGEX_ASSERT(fields[4]=="");
1806
1807     delete pat1;
1808
1809     //
1810     // RegexPattern::pattern()
1811     //
1812     pat1 = new RegexPattern();
1813     REGEX_ASSERT(pat1->pattern() == "");
1814     delete pat1;
1815
1816     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1817     REGEX_CHECK_STATUS;
1818     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1819     delete pat1;
1820
1821
1822     //
1823     // classID functions
1824     //
1825     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1826     REGEX_CHECK_STATUS;
1827     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1828     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1829     UnicodeString Hello("Hello, world.");
1830     RegexMatcher *m = pat1->matcher(Hello, status);
1831     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1832     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1833     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1834     delete m;
1835     delete pat1;
1836
1837 }
1838
1839 //---------------------------------------------------------------------------
1840 //
1841 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1842 //                       is present and working, but excluding functions
1843 //                       implementing replace operations.
1844 //
1845 //---------------------------------------------------------------------------
1846 void RegexTest::API_Match_UTF8() {   // Matcher API exercised through UText input; the replace functions are covered by API_Replace_UTF8.
1847     UParseError         pe;
1848     UErrorCode          status=U_ZERO_ERROR;
1849     int32_t             flags = 0;
1850
1851     //
1852     // Debug - slide failing test cases into the block below to run them early
1853     //
1854 #if 0
1855     {
1856     }
1857     return;   // when this section is enabled, none of the tests below are run
1858 #endif
1859
1860     //
1861     // Simple pattern compilation
1862     //
1863     {
1864         UText               re = UTEXT_INITIALIZER;
1865         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1866         REGEX_VERBOSE_TEXT(&re);
1867         RegexPattern        *pat2;
1868         pat2 = RegexPattern::compile(&re, flags, pe, status);
1869         REGEX_CHECK_STATUS;
1870
1871         UText input1 = UTEXT_INITIALIZER;
1872         UText input2 = UTEXT_INITIALIZER;
1873         UText empty  = UTEXT_INITIALIZER;
1874         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1875         REGEX_VERBOSE_TEXT(&input1);
1876         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1877         REGEX_VERBOSE_TEXT(&input2);
1878         utext_openUChars(&empty, NULL, 0, &status);   // zero-length text
1879
1880         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1881         int32_t input2Len = strlen("not abc");
1882
1883
1884         //
1885         // Matcher creation and reset.
1886         //
1887         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);   // reset() returns the matcher by reference; keep its address
1888         REGEX_CHECK_STATUS;
1889         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1890         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1891         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1892         m1->reset(&input2);
1893         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1894         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1895         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1896         m1->reset(&input1);
1897         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1898         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1899         m1->reset(&empty);
1900         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1901         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1902
1903         //
1904         //  reset(pos, status)
1905         //
1906         m1->reset(&input1);
1907         m1->reset(4, status);
1908         REGEX_CHECK_STATUS;
1909         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1910         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1911
1912         m1->reset(-1, status);
1913         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1914         status = U_ZERO_ERROR;
1915
1916         m1->reset(0, status);
1917         REGEX_CHECK_STATUS;
1918         status = U_ZERO_ERROR;
1919
1920         m1->reset(input1Len-1, status);
1921         REGEX_CHECK_STATUS;
1922         status = U_ZERO_ERROR;
1923
1924         m1->reset(input1Len, status);   // resetting to exactly the end of the input is expected to succeed
1925         REGEX_CHECK_STATUS;
1926         status = U_ZERO_ERROR;
1927
1928         m1->reset(input1Len+1, status);   // one past the end is expected to be rejected
1929         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1930         status = U_ZERO_ERROR;
1931
1932         //
1933         // match(pos, status)
1934         //
1935         m1->reset(&input2);
1936         REGEX_ASSERT(m1->matches(4, status) == TRUE);   // "abc" begins at offset 4 of "not abc"
1937         m1->reset();
1938         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1939         m1->reset();
1940         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1941         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1942         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1943         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1944
1945         // Match() at end of string should fail, but should not
1946         //  be an error.
1947         status = U_ZERO_ERROR;
1948         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1949         REGEX_CHECK_STATUS;
1950
1951         // Match beyond end of string should fail with an error.
1952         status = U_ZERO_ERROR;
1953         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1954         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1955
1956         // Successful match at end of string.
1957         {
1958             status = U_ZERO_ERROR;
1959             RegexMatcher m("A?", 0, status);  // will match zero length string.
1960             REGEX_CHECK_STATUS;
1961             m.reset(&input1);
1962             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1963             REGEX_CHECK_STATUS;
1964             m.reset(&empty);
1965             REGEX_ASSERT(m.matches(0, status) == TRUE);
1966             REGEX_CHECK_STATUS;
1967         }
1968
1969
1970         //
1971         // lookingAt(pos, status)
1972         //
1973         status = U_ZERO_ERROR;
1974         m1->reset(&input2);  // "not abc"
1975         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1976         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1977         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1978         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1980         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1981         status = U_ZERO_ERROR;
1982         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1983         REGEX_CHECK_STATUS;
1984         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1985         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1986
1987         delete m1;
1988         delete pat2;
1989
1990         utext_close(&re);
1991         utext_close(&input1);
1992         utext_close(&input2);
1993         utext_close(&empty);
1994     }
1995
1996
1997     //
1998     // Capture Group.
1999     //     RegexMatcher::start();
2000     //     RegexMatcher::end();
2001     //     RegexMatcher::groupCount();
2002     //
2003     {
2004         int32_t             flags=0;   // these three locals shadow the function-level flags/pe/status
2005         UParseError         pe;
2006         UErrorCode          status=U_ZERO_ERROR;
2007         UText               re=UTEXT_INITIALIZER;
2008         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2009         utext_openUTF8(&re, str_01234567_pat, -1, &status);
2010
2011         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2012         REGEX_CHECK_STATUS;
2013
2014         UText input = UTEXT_INITIALIZER;
2015         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2016         utext_openUTF8(&input, str_0123456789, -1, &status);
2017
2018         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2019         REGEX_CHECK_STATUS;
2020         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2021         static const int32_t matchStarts[] = {0,  2, 4, 8};    // expected start(i) for groups 0..3
2022         static const int32_t matchEnds[]   = {10, 8, 6, 10};   // expected end(i) for groups 0..3
2023         int32_t i;
2024         for (i=0; i<4; i++) {
2025             int32_t actualStart = matcher->start(i, status);
2026             REGEX_CHECK_STATUS;
2027             if (actualStart != matchStarts[i]) {
2028                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2029                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2030             }
2031             int32_t actualEnd = matcher->end(i, status);
2032             REGEX_CHECK_STATUS;
2033             if (actualEnd != matchEnds[i]) {
2034                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2035                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2036             }
2037         }
2038
2039         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2040         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2041
2042         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2043         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2044         matcher->reset();
2045         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);   // no match is in effect right after reset()
2046
2047         matcher->lookingAt(status);   // re-establish a match for the group() checks below
2048
2049         UnicodeString dest;
2050         UText destText = UTEXT_INITIALIZER;
2051         utext_openUnicodeString(&destText, &dest, &status);
2052         UText *result;
2053         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2054         //  Test shallow-clone API
2055         int64_t   group_len;
2056         result = matcher->group((UText *)NULL, group_len, status);
2057         REGEX_CHECK_STATUS;
2058         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2059         utext_close(result);
2060         result = matcher->group(0, &destText, group_len, status);
2061         REGEX_CHECK_STATUS;
2062         REGEX_ASSERT(result == &destText);
2063         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2064         //  destText is now immutable, reopen it
2065         utext_close(&destText);
2066         utext_openUnicodeString(&destText, &dest, &status);
2067
2068         int64_t length;
2069         result = matcher->group(0, NULL, length, status);
2070         REGEX_CHECK_STATUS;
2071         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2072         utext_close(result);
2073         result = matcher->group(0, &destText, length, status);
2074         REGEX_CHECK_STATUS;
2075         REGEX_ASSERT(result == &destText);
2076         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2077         REGEX_ASSERT(length == 10);
2078         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079
2080         // Capture Group 1 == "234567"
2081         result = matcher->group(1, NULL, length, status);
2082         REGEX_CHECK_STATUS;
2083         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2084         REGEX_ASSERT(length == 6);
2085         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);   // the text passed for comparison is the whole input, not just the group
2086         utext_close(result);
2087
2088         result = matcher->group(1, &destText, length, status);
2089         REGEX_CHECK_STATUS;
2090         REGEX_ASSERT(result == &destText);
2091         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2092         REGEX_ASSERT(length == 6);
2093         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2094         utext_close(result);
2095
2096         // Capture Group 2 == "45"
2097         result = matcher->group(2, NULL, length, status);
2098         REGEX_CHECK_STATUS;
2099         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2100         REGEX_ASSERT(length == 2);
2101         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2102         utext_close(result);
2103
2104         result = matcher->group(2, &destText, length, status);
2105         REGEX_CHECK_STATUS;
2106         REGEX_ASSERT(result == &destText);
2107         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2108         REGEX_ASSERT(length == 2);
2109         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2110         utext_close(result);
2111
2112         // Capture Group 3 == "89"
2113         result = matcher->group(3, NULL, length, status);
2114         REGEX_CHECK_STATUS;
2115         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2116         REGEX_ASSERT(length == 2);
2117         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2118         utext_close(result);
2119
2120         result = matcher->group(3, &destText, length, status);
2121         REGEX_CHECK_STATUS;
2122         REGEX_ASSERT(result == &destText);
2123         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2124         REGEX_ASSERT(length == 2);
2125         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2126         utext_close(result);
2127
2128         // Capture Group number out of range.
2129         status = U_ZERO_ERROR;
2130         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2131         status = U_ZERO_ERROR;
2132         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2133         status = U_ZERO_ERROR;
2134         matcher->reset();
2135         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2136
2137         delete matcher;
2138         delete pat;
2139
2140         utext_close(&destText);
2141         utext_close(&input);
2142         utext_close(&re);
2143     }
2144
2145     //
2146     //  find
2147     //
2148     {
2149         int32_t             flags=0;
2150         UParseError         pe;
2151         UErrorCode          status=U_ZERO_ERROR;
2152         UText               re=UTEXT_INITIALIZER;
2153         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2154         utext_openUTF8(&re, str_abc, -1, &status);
2155
2156         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2157         REGEX_CHECK_STATUS;
2158         UText input = UTEXT_INITIALIZER;
2159         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2160         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2161         // The input is 17 code units long; "abc" occurs at offsets 1, 6 and 12.
2162
2163         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2164         REGEX_CHECK_STATUS;
2165         REGEX_ASSERT(matcher->find());
2166         REGEX_ASSERT(matcher->start(status) == 1);
2167         REGEX_ASSERT(matcher->find());
2168         REGEX_ASSERT(matcher->start(status) == 6);
2169         REGEX_ASSERT(matcher->find());
2170         REGEX_ASSERT(matcher->start(status) == 12);
2171         REGEX_ASSERT(matcher->find() == FALSE);
2172         REGEX_ASSERT(matcher->find() == FALSE);
2173
2174         matcher->reset();
2175         REGEX_ASSERT(matcher->find());
2176         REGEX_ASSERT(matcher->start(status) == 1);
2177
2178         REGEX_ASSERT(matcher->find(0, status));
2179         REGEX_ASSERT(matcher->start(status) == 1);
2180         REGEX_ASSERT(matcher->find(1, status));
2181         REGEX_ASSERT(matcher->start(status) == 1);
2182         REGEX_ASSERT(matcher->find(2, status));
2183         REGEX_ASSERT(matcher->start(status) == 6);
2184         REGEX_ASSERT(matcher->find(12, status));
2185         REGEX_ASSERT(matcher->start(status) == 12);
2186         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2187         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2188         REGEX_ASSERT(matcher->find(17, status) == FALSE);   // 17 is the end of the input: no match, but no error either
2189         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2190
2191         status = U_ZERO_ERROR;
2192         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2193         status = U_ZERO_ERROR;
2194         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);   // 18 is beyond the 17-unit input
2195
2196         REGEX_ASSERT(matcher->groupCount() == 0);
2197
2198         delete matcher;
2199         delete pat;
2200
2201         utext_close(&input);
2202         utext_close(&re);
2203     }
2204
2205
2206     //
2207     //  find, with \G in pattern (true if at the end of a previous match).
2208     //
2209     {
2210         int32_t             flags=0;
2211         UParseError         pe;
2212         UErrorCode          status=U_ZERO_ERROR;
2213         UText               re=UTEXT_INITIALIZER;
2214         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2215         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2216
2217         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2218
2219         REGEX_CHECK_STATUS;
2220         UText input = UTEXT_INITIALIZER;
2221         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2222         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2223         // The input is 13 code units long; "abc" occurs at offsets 1, 4 and 8.
2224
2225         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2226         REGEX_CHECK_STATUS;
2227         REGEX_ASSERT(matcher->find());
2228         REGEX_ASSERT(matcher->start(status) == 0);
2229         REGEX_ASSERT(matcher->start(1, status) == -1);
2230         REGEX_ASSERT(matcher->start(2, status) == 1);    // first match is expected via group 2, the plain "abc" alternative
2231
2232         REGEX_ASSERT(matcher->find());
2233         REGEX_ASSERT(matcher->start(status) == 4);
2234         REGEX_ASSERT(matcher->start(1, status) == 4);    // second match is expected via group 1, the \G alternative
2235         REGEX_ASSERT(matcher->start(2, status) == -1);
2236         REGEX_CHECK_STATUS;
2237
2238         delete matcher;
2239         delete pat;
2240
2241         utext_close(&input);
2242         utext_close(&re);
2243     }
2244
2245     //
2246     //   find with zero length matches, match position should bump ahead
2247     //     to prevent loops.
2248     //
2249     {
2250         int32_t                 i;
2251         UErrorCode          status=U_ZERO_ERROR;
2252         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2253                                                       //   using an always-true look-ahead.
2254         REGEX_CHECK_STATUS;
2255         UText s = UTEXT_INITIALIZER;
2256         utext_openUTF8(&s, "    ", -1, &status);
2257         m.reset(&s);
2258         for (i=0; ; i++) {
2259             if (m.find() == FALSE) {
2260                 break;
2261             }
2262             REGEX_ASSERT(m.start(status) == i);
2263             REGEX_ASSERT(m.end(status) == i);
2264         }
2265         REGEX_ASSERT(i==5);   // empty matches at offsets 0..4 of the four-space input
2266
2267         // Check that the bump goes over characters outside the BMP OK
2268         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2269         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2270         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2271         m.reset(&s);
2272         for (i=0; ; i+=4) {   // each of these code points occupies 4 bytes in aboveBMP
2273             if (m.find() == FALSE) {
2274                 break;
2275             }
2276             REGEX_ASSERT(m.start(status) == i);
2277             REGEX_ASSERT(m.end(status) == i);
2278         }
2279         REGEX_ASSERT(i==20);   // five matches (offsets 0, 4, 8, 12, 16), so the counter ends at 20
2280
2281         utext_close(&s);
2282     }
2283     {
2284         // find() loop breaking test.
2285         //        with pattern of /.?/, should see a series of one char matches, then a single
2286         //        match of zero length at the end of the input string.
2287         int32_t                 i;
2288         UErrorCode          status=U_ZERO_ERROR;
2289         RegexMatcher        m(".?", 0, status);
2290         REGEX_CHECK_STATUS;
2291         UText s = UTEXT_INITIALIZER;
2292         utext_openUTF8(&s, "    ", -1, &status);
2293         m.reset(&s);
2294         for (i=0; ; i++) {
2295             if (m.find() == FALSE) {
2296                 break;
2297             }
2298             REGEX_ASSERT(m.start(status) == i);
2299             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));   // the last match (i==4) is the empty one at the end of input
2300         }
2301         REGEX_ASSERT(i==5);
2302
2303         utext_close(&s);
2304     }
2305
2306
2307     //
2308     // Matchers with no input string behave as if they had an empty input string.
2309     //
2310
2311     {
2312         UErrorCode status = U_ZERO_ERROR;
2313         RegexMatcher  m(".?", 0, status);
2314         REGEX_CHECK_STATUS;
2315         REGEX_ASSERT(m.find());
2316         REGEX_ASSERT(m.start(status) == 0);
2317         REGEX_ASSERT(m.input() == "");
2318     }
2319     {
2320         UErrorCode status = U_ZERO_ERROR;
2321         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2322         RegexMatcher  *m = p->matcher(status);
2323         REGEX_CHECK_STATUS;
2324
2325         REGEX_ASSERT(m->find() == FALSE);
2326         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2327         delete m;
2328         delete p;
2329     }
2330
2331     //
2332     // Regions
2333     //
2334     {
2335         UErrorCode status = U_ZERO_ERROR;
2336         UText testPattern = UTEXT_INITIALIZER;
2337         UText testText    = UTEXT_INITIALIZER;
2338         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2339         REGEX_VERBOSE_TEXT(&testPattern);
2340         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2341         REGEX_VERBOSE_TEXT(&testText);
2342
2343         RegexMatcher m(&testPattern, &testText, 0, status);
2344         REGEX_CHECK_STATUS;
2345         REGEX_ASSERT(m.regionStart() == 0);
2346         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2347         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2348         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2349
2350         m.region(2,4, status);   // ".*" is then expected to match exactly offsets 2..4
2351         REGEX_CHECK_STATUS;
2352         REGEX_ASSERT(m.matches(status));
2353         REGEX_ASSERT(m.start(status)==2);
2354         REGEX_ASSERT(m.end(status)==4);
2355         REGEX_CHECK_STATUS;
2356
2357         m.reset();   // reset() is expected to restore the region to the whole input
2358         REGEX_ASSERT(m.regionStart() == 0);
2359         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2360
2361         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2362         REGEX_VERBOSE_TEXT(&testText);
2363         m.reset(&testText);
2364         REGEX_ASSERT(m.regionStart() == 0);
2365         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2366
2367         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2368         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));   // the setter is expected to return the matcher itself
2369         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2370         REGEX_ASSERT(&m == &m.reset());                     // reset() must leave the bounds setting unchanged
2371         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2372
2373         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2374         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2375         REGEX_ASSERT(&m == &m.reset());
2376         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2377
2378         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2379         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2380         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2381         REGEX_ASSERT(&m == &m.reset());
2382         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2383
2384         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2385         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2386         REGEX_ASSERT(&m == &m.reset());
2387         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2388
2389         utext_close(&testText);
2390         utext_close(&testPattern);
2391     }
2392
2393     //
2394     // hitEnd() and requireEnd()
2395     //
2396     {
2397         UErrorCode status = U_ZERO_ERROR;
2398         UText testPattern = UTEXT_INITIALIZER;
2399         UText testText    = UTEXT_INITIALIZER;
2400         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2401         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2402         utext_openUTF8(&testPattern, str_, -1, &status);
2403         utext_openUTF8(&testText, str_aabb, -1, &status);
2404
2405         RegexMatcher m1(&testPattern, &testText,  0, status);
2406         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2407         REGEX_ASSERT(m1.hitEnd() == TRUE);        // ".*" is expected to run into the end of "aabb"
2408         REGEX_ASSERT(m1.requireEnd() == FALSE);
2409         REGEX_CHECK_STATUS;
2410
2411         status = U_ZERO_ERROR;
2412         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2413         utext_openUTF8(&testPattern, str_a, -1, &status);
2414         RegexMatcher m2(&testPattern, &testText, 0, status);
2415         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2416         REGEX_ASSERT(m2.hitEnd() == FALSE);       // "a*" is expected to stop before the end of "aabb"
2417         REGEX_ASSERT(m2.requireEnd() == FALSE);
2418         REGEX_CHECK_STATUS;
2419
2420         status = U_ZERO_ERROR;
2421         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2422         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2423         RegexMatcher m3(&testPattern, &testText, 0, status);
2424         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2425         REGEX_ASSERT(m3.hitEnd() == TRUE);
2426         REGEX_ASSERT(m3.requireEnd() == TRUE);    // with the trailing "$", both hitEnd() and requireEnd() are expected to be TRUE
2427         REGEX_CHECK_STATUS;
2428
2429         utext_close(&testText);
2430         utext_close(&testPattern);
2431     }
2432 }
2433
2434
2435 //---------------------------------------------------------------------------
2436 //
2437 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2438 //                         Replace family of functions (replaceFirst, replaceAll,
2439 //                         appendReplacement, appendTail) called with UText arguments.
2440 //---------------------------------------------------------------------------
2441 void RegexTest::API_Replace_UTF8() {
2442     //
2443     //  Replace: compile the pattern "abc" and attach a matcher to the UTF-8 input below.
2444     //  Text is spelled as byte values, presumably to avoid host-charset assumptions (cf. the ASCII note at the top of this file).
2445     int32_t             flags=0;
2446     UParseError         pe;
2447     UErrorCode          status=U_ZERO_ERROR;
2448
2449     UText               re=UTEXT_INITIALIZER;
2450     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2451     REGEX_VERBOSE_TEXT(&re);
2452     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2453     REGEX_CHECK_STATUS;
2454
2455     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2456     //             012345678901234567
2457     UText dataText = UTEXT_INITIALIZER;
2458     utext_openUTF8(&dataText, data, -1, &status);
2459     REGEX_CHECK_STATUS;
2460     REGEX_VERBOSE_TEXT(&dataText);
2461     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2462     // matcher and pat are held by raw pointer; both are deleted at the end of this function.
2463     //
2464     //  Plain vanilla matches.
2465     //  Each case runs with dest==NULL (the returned UText is closed here) and with dest==&destText (the same UText must come back).
2466     UnicodeString  dest;
2467     UText destText = UTEXT_INITIALIZER;
2468     utext_openUnicodeString(&destText, &dest, &status);
2469     UText *result;
2470
2471     UText replText = UTEXT_INITIALIZER;
2472
2473     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2474     utext_openUTF8(&replText, str_yz, -1, &status);
2475     REGEX_VERBOSE_TEXT(&replText);
2476     result = matcher->replaceFirst(&replText, NULL, status);
2477     REGEX_CHECK_STATUS;
2478     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2479     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2480     utext_close(result);
2481     result = matcher->replaceFirst(&replText, &destText, status);
2482     REGEX_CHECK_STATUS;
2483     REGEX_ASSERT(result == &destText);
2484     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2485
2486     result = matcher->replaceAll(&replText, NULL, status);
2487     REGEX_CHECK_STATUS;
2488     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2489     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2490     utext_close(result);
2491     // Empty destText (replace its whole range with nothing) before it is used as the destination again.
2492     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2493     result = matcher->replaceAll(&replText, &destText, status);
2494     REGEX_CHECK_STATUS;
2495     REGEX_ASSERT(result == &destText);
2496     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2497
2498     //
2499     //  Plain vanilla non-matches: the input contains no "abc", so the expected output equals the input.
2500     //
2501     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2502     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2503     matcher->reset(&dataText);
2504
2505     result = matcher->replaceFirst(&replText, NULL, status);
2506     REGEX_CHECK_STATUS;
2507     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2508     utext_close(result);
2509     result = matcher->replaceFirst(&replText, &destText, status);
2510     REGEX_CHECK_STATUS;
2511     REGEX_ASSERT(result == &destText);
2512     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2513
2514     result = matcher->replaceAll(&replText, NULL, status);
2515     REGEX_CHECK_STATUS;
2516     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2517     utext_close(result);
2518     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2519     result = matcher->replaceAll(&replText, &destText, status);
2520     REGEX_CHECK_STATUS;
2521     REGEX_ASSERT(result == &destText);
2522     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2523
2524     //
2525     // Empty source string (dataText opened on a NULL pointer with length 0); every result must be "".
2526     //
2527     utext_openUTF8(&dataText, NULL, 0, &status);
2528     matcher->reset(&dataText);
2529
2530     result = matcher->replaceFirst(&replText, NULL, status);
2531     REGEX_CHECK_STATUS;
2532     REGEX_ASSERT_UTEXT_UTF8("", result);
2533     utext_close(result);
2534     result = matcher->replaceFirst(&replText, &destText, status);
2535     REGEX_CHECK_STATUS;
2536     REGEX_ASSERT(result == &destText);
2537     REGEX_ASSERT_UTEXT_UTF8("", result);
2538
2539     result = matcher->replaceAll(&replText, NULL, status);
2540     REGEX_CHECK_STATUS;
2541     REGEX_ASSERT_UTEXT_UTF8("", result);
2542     utext_close(result);
2543     result = matcher->replaceAll(&replText, &destText, status);
2544     REGEX_CHECK_STATUS;
2545     REGEX_ASSERT(result == &destText);
2546     REGEX_ASSERT_UTEXT_UTF8("", result);
2547
2548     //
2549     // Empty substitution string: each replaced "abc" is expected to vanish from the output.
2550     //
2551     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2552     matcher->reset(&dataText);
2553
2554     utext_openUTF8(&replText, NULL, 0, &status);
2555     result = matcher->replaceFirst(&replText, NULL, status);
2556     REGEX_CHECK_STATUS;
2557     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2558     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2559     utext_close(result);
2560     result = matcher->replaceFirst(&replText, &destText, status);
2561     REGEX_CHECK_STATUS;
2562     REGEX_ASSERT(result == &destText);
2563     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2564
2565     result = matcher->replaceAll(&replText, NULL, status);
2566     REGEX_CHECK_STATUS;
2567     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2568     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2569     utext_close(result);
2570     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2571     result = matcher->replaceAll(&replText, &destText, status);
2572     REGEX_CHECK_STATUS;
2573     REGEX_ASSERT(result == &destText);
2574     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2575
2576     //
2577     // match whole string: input "abc" against pattern "abc", so the result is just the replacement.
2578     //
2579     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2580     utext_openUTF8(&dataText, str_abc, -1, &status);
2581     matcher->reset(&dataText);
2582
2583     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2584     utext_openUTF8(&replText, str_xyz, -1, &status);
2585     result = matcher->replaceFirst(&replText, NULL, status);
2586     REGEX_CHECK_STATUS;
2587     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2588     utext_close(result);
2589     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2590     result = matcher->replaceFirst(&replText, &destText, status);
2591     REGEX_CHECK_STATUS;
2592     REGEX_ASSERT(result == &destText);
2593     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2594
2595     result = matcher->replaceAll(&replText, NULL, status);
2596     REGEX_CHECK_STATUS;
2597     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2598     utext_close(result);
2599     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2600     result = matcher->replaceAll(&replText, &destText, status);
2601     REGEX_CHECK_STATUS;
2602     REGEX_ASSERT(result == &destText);
2603     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2604
2605     //
2606     // Capture Group, simple case: a second pattern/matcher pair (pat2, matcher2) for "a(..)" on "abcdefg".
2607     //
2608     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2609     utext_openUTF8(&re, str_add, -1, &status);
2610     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2611     REGEX_CHECK_STATUS;
2612
2613     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2614     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2615     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2616     REGEX_CHECK_STATUS;
2617
2618     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2619     utext_openUTF8(&replText, str_11, -1, &status);
2620     result = matcher2->replaceFirst(&replText, NULL, status);
2621     REGEX_CHECK_STATUS;
2622     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2623     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2624     utext_close(result);
2625     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2626     result = matcher2->replaceFirst(&replText, &destText, status);
2627     REGEX_CHECK_STATUS;
2628     REGEX_ASSERT(result == &destText);
2629     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2630     // Backslash-escaped "\$1" is expected to come out as a literal "$1"; the unescaped "$1" as the group text "bc".
2631     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2632     utext_openUTF8(&replText, str_v, -1, &status);
2633     REGEX_VERBOSE_TEXT(&replText);
2634     result = matcher2->replaceFirst(&replText, NULL, status);
2635     REGEX_CHECK_STATUS;
2636     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2637     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2638     utext_close(result);
2639     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2640     result = matcher2->replaceFirst(&replText, &destText, status);
2641     REGEX_CHECK_STATUS;
2642     REGEX_ASSERT(result == &destText);
2643     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2644     // Escaped "\$" with no digits after it: expected to come out as plain "$" characters.
2645     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2646                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2647                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2648     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2649     result = matcher2->replaceFirst(&replText, NULL, status);
2650     REGEX_CHECK_STATUS;
2651     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2652     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2653     utext_close(result);
2654     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2655     result = matcher2->replaceFirst(&replText, &destText, status);
2656     REGEX_CHECK_STATUS;
2657     REGEX_ASSERT(result == &destText);
2658     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2659     // "$" followed by a supplementary-plane digit: the expected output treats it as a reference to group 1 ("bc").
2660     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2661     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2662     //                                 012345678901234567890123456
2663     supplDigitChars[22] = 0xF0;  // bytes 22..25: the "xxxx" placeholder is overwritten with the UTF-8 form of U+1D7CF
2664     supplDigitChars[23] = 0x9D;
2665     supplDigitChars[24] = 0x9F;
2666     supplDigitChars[25] = 0x8F;
2667     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2668
2669     result = matcher2->replaceFirst(&replText, NULL, status);
2670     REGEX_CHECK_STATUS;
2671     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2672     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2673     utext_close(result);
2674     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2675     result = matcher2->replaceFirst(&replText, &destText, status);
2676     REGEX_CHECK_STATUS;
2677     REGEX_ASSERT(result == &destText);
2678     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2679     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2680     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2681     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);  // pattern a(..) has one group, so $5 must be rejected
2682 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2683     utext_close(result);
2684     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2685     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2686     REGEX_ASSERT(result == &destText);
2687 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2688
2689     //
2690     // Replacement String with \u hex escapes (back on the first matcher, pattern "abc")
2691     //
2692     {
2693       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2694       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2695         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2696         utext_openUTF8(&replText, str_u0043, -1, &status);
2697         matcher->reset(&dataText);
2698
2699         result = matcher->replaceAll(&replText, NULL, status);
2700         REGEX_CHECK_STATUS;
2701         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2702         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2703         utext_close(result);
2704         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2705         result = matcher->replaceAll(&replText, &destText, status);
2706         REGEX_CHECK_STATUS;
2707         REGEX_ASSERT(result == &destText);
2708         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2709     }
2710     {
2711       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2712         utext_openUTF8(&dataText, str_abc, -1, &status);
2713         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2714         utext_openUTF8(&replText, str_U00010000, -1, &status);
2715         matcher->reset(&dataText);
2716
2717         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2718         //                          0123456789
2719         expected[2] = 0xF0;  // bytes 2..5: the "xxxx" placeholder is overwritten with the UTF-8 form of U+10000
2720         expected[3] = 0x90;
2721         expected[4] = 0x80;
2722         expected[5] = 0x80;
2723
2724         result = matcher->replaceAll(&replText, NULL, status);
2725         REGEX_CHECK_STATUS;
2726         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2727         utext_close(result);
2728         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2729         result = matcher->replaceAll(&replText, &destText, status);
2730         REGEX_CHECK_STATUS;
2731         REGEX_ASSERT(result == &destText);
2732         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2733     }
2734     // TODO:  need more thorough testing of capture substitutions.
2735
2736     // Bug 4057
2737     //   Checks appendReplacement() output after extra find() calls, after reset(index) and after find(index).
2738     {
2739         status = U_ZERO_ERROR;
2740 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2741 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2742 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2743         utext_openUTF8(&re, str_ssee, -1, &status);
2744         utext_openUTF8(&dataText, str_blah, -1, &status);
2745         utext_openUTF8(&replText, str_ooh, -1, &status);
2746
2747         RegexMatcher m(&re, 0, status);
2748         REGEX_CHECK_STATUS;
2749         // Local result/resultText collect the appended output; resultText is closed at the end of this scope.
2750         UnicodeString result;
2751         UText resultText = UTEXT_INITIALIZER;
2752         utext_openUnicodeString(&resultText, &result, &status);
2753
2754         // Multiple finds do NOT bump up the previous appendReplacement position.
2755         m.reset(&dataText);
2756         m.find();
2757         m.find();
2758         m.appendReplacement(&resultText, &replText, status);
2759         REGEX_CHECK_STATUS;
2760         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2761         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2762
2763         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2764         status = U_ZERO_ERROR;
2765         result.truncate(0);  // start over with an empty result string, then re-open resultText on it
2766         utext_openUnicodeString(&resultText, &result, &status);
2767         m.reset(10, status);
2768         m.find();
2769         m.find();
2770         m.appendReplacement(&resultText, &replText, status);
2771         REGEX_CHECK_STATUS;
2772         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2773         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2774
2775         // find() at interior of string, appendReplacement still starts at beginning.
2776         status = U_ZERO_ERROR;
2777         result.truncate(0);
2778         utext_openUnicodeString(&resultText, &result, &status);
2779         m.reset();
2780         m.find(10, status);
2781         m.find();
2782         m.appendReplacement(&resultText, &replText, status);
2783         REGEX_CHECK_STATUS;
2784         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2785         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2786         // appendTail() adds the text after the last match (" fin") to what appendReplacement() produced above.
2787         m.appendTail(&resultText, status);
2788         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2789         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2790
2791         utext_close(&resultText);
2792     }
2793     // Cleanup: delete both matcher/pattern pairs, then close the UTexts opened in this function.
2794     delete matcher2;
2795     delete pat2;
2796     delete matcher;
2797     delete pat;
2798
2799     utext_close(&dataText);
2800     utext_close(&replText);
2801     utext_close(&destText);
2802     utext_close(&re);
2803 }
2804
2805
2806 //---------------------------------------------------------------------------
2807 //
2808 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2809 //                        present and nominally working.
2810 //
2811 //---------------------------------------------------------------------------
2812 void RegexTest::API_Pattern_UTF8() {
2813     RegexPattern        pata;    // Test default constructor to not crash.
2814     RegexPattern        patb;
2815
2816     REGEX_ASSERT(pata == patb);
2817     REGEX_ASSERT(pata == pata);
2818
2819     UText         re1 = UTEXT_INITIALIZER;
2820     UText         re2 = UTEXT_INITIALIZER;
2821     UErrorCode    status = U_ZERO_ERROR;
2822     UParseError   pe;
2823
2824     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2825     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2826     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2827     utext_openUTF8(&re2, str_def, -1, &status);
2828
2829     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2830     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2831     REGEX_CHECK_STATUS;
2832     REGEX_ASSERT(*pat1 == *pat1);
2833     REGEX_ASSERT(*pat1 != pata);
2834
2835     // Assign
2836     patb = *pat1;
2837     REGEX_ASSERT(patb == *pat1);
2838
2839     // Copy Construct
2840     RegexPattern patc(*pat1);
2841     REGEX_ASSERT(patc == *pat1);
2842     REGEX_ASSERT(patb == patc);
2843     REGEX_ASSERT(pat1 != pat2);
2844     patb = *pat2;
2845     REGEX_ASSERT(patb != patc);
2846     REGEX_ASSERT(patb == *pat2);
2847
2848     // Compile with no flags.
2849     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2850     REGEX_ASSERT(*pat1a == *pat1);
2851
2852     REGEX_ASSERT(pat1a->flags() == 0);
2853
2854     // Compile with different flags should be not equal
2855     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2856     REGEX_CHECK_STATUS;
2857
2858     REGEX_ASSERT(*pat1b != *pat1a);
2859     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2860     REGEX_ASSERT(pat1a->flags() == 0);
2861     delete pat1b;
2862
2863     // clone
2864     RegexPattern *pat1c = pat1->clone();
2865     REGEX_ASSERT(*pat1c == *pat1);
2866     REGEX_ASSERT(*pat1c != *pat2);
2867
2868     delete pat1c;
2869     delete pat1a;
2870     delete pat1;
2871     delete pat2;
2872
2873     utext_close(&re1);
2874     utext_close(&re2);
2875
2876
2877     //
2878     //   Verify that a matcher created from a cloned pattern works.
2879     //     (Jitterbug 3423)
2880     //
2881     {
2882         UErrorCode     status     = U_ZERO_ERROR;
2883         UText          pattern    = UTEXT_INITIALIZER;
2884         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2885         utext_openUTF8(&pattern, str_pL, -1, &status);
2886
2887         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2888         RegexPattern  *pClone     = pSource->clone();
2889         delete         pSource;
2890         RegexMatcher  *mFromClone = pClone->matcher(status);
2891         REGEX_CHECK_STATUS;
2892
2893         UText          input      = UTEXT_INITIALIZER;
2894         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2895         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2896         mFromClone->reset(&input);
2897         REGEX_ASSERT(mFromClone->find() == TRUE);
2898         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2899         REGEX_ASSERT(mFromClone->find() == TRUE);
2900         REGEX_ASSERT(mFromClone->group(status) == "World");
2901         REGEX_ASSERT(mFromClone->find() == FALSE);
2902         delete mFromClone;
2903         delete pClone;
2904
2905         utext_close(&input);
2906         utext_close(&pattern);
2907     }
2908
2909     //
2910     //   matches convenience API
2911     //
2912     {
2913         UErrorCode status  = U_ZERO_ERROR;
2914         UText      pattern = UTEXT_INITIALIZER;
2915         UText      input   = UTEXT_INITIALIZER;
2916
2917         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2918         utext_openUTF8(&input, str_randominput, -1, &status);
2919
2920         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2921         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2922         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2923         REGEX_CHECK_STATUS;
2924
2925         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2926         utext_openUTF8(&pattern, str_abc, -1, &status);
2927         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2928         REGEX_CHECK_STATUS;
2929
2930         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2931         utext_openUTF8(&pattern, str_nput, -1, &status);
2932         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2933         REGEX_CHECK_STATUS;
2934
2935         utext_openUTF8(&pattern, str_randominput, -1, &status);
2936         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2937         REGEX_CHECK_STATUS;
2938
2939         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2940         utext_openUTF8(&pattern, str_u, -1, &status);
2941         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2942         REGEX_CHECK_STATUS;
2943
2944         utext_openUTF8(&input, str_abc, -1, &status);
2945         utext_openUTF8(&pattern, str_abc, -1, &status);
2946         status = U_INDEX_OUTOFBOUNDS_ERROR;
2947         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2948         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2949
2950         utext_close(&input);
2951         utext_close(&pattern);
2952     }
2953
2954
2955     //
2956     // Split()
2957     //
2958     status = U_ZERO_ERROR;
2959     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2960     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2961     pat1 = RegexPattern::compile(&re1, pe, status);
2962     REGEX_CHECK_STATUS;
2963     UnicodeString  fields[10];
2964
2965     int32_t n;
2966     n = pat1->split("Now is the time", fields, 10, status);
2967     REGEX_CHECK_STATUS;
2968     REGEX_ASSERT(n==4);
2969     REGEX_ASSERT(fields[0]=="Now");
2970     REGEX_ASSERT(fields[1]=="is");
2971     REGEX_ASSERT(fields[2]=="the");
2972     REGEX_ASSERT(fields[3]=="time");
2973     REGEX_ASSERT(fields[4]=="");
2974
2975     n = pat1->split("Now is the time", fields, 2, status);
2976     REGEX_CHECK_STATUS;
2977     REGEX_ASSERT(n==2);
2978     REGEX_ASSERT(fields[0]=="Now");
2979     REGEX_ASSERT(fields[1]=="is the time");
2980     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2981
2982     fields[1] = "*";
2983     status = U_ZERO_ERROR;
2984     n = pat1->split("Now is the time", fields, 1, status);
2985     REGEX_CHECK_STATUS;
2986     REGEX_ASSERT(n==1);
2987     REGEX_ASSERT(fields[0]=="Now is the time");
2988     REGEX_ASSERT(fields[1]=="*");
2989     status = U_ZERO_ERROR;
2990
2991     n = pat1->split("    Now       is the time   ", fields, 10, status);
2992     REGEX_CHECK_STATUS;
2993     REGEX_ASSERT(n==6);
2994     REGEX_ASSERT(fields[0]=="");
2995     REGEX_ASSERT(fields[1]=="Now");
2996     REGEX_ASSERT(fields[2]=="is");
2997     REGEX_ASSERT(fields[3]=="the");
2998     REGEX_ASSERT(fields[4]=="time");
2999     REGEX_ASSERT(fields[5]=="");
3000     REGEX_ASSERT(fields[6]=="");
3001
3002     fields[2] = "*";
3003     n = pat1->split("     ", fields, 10, status);
3004     REGEX_CHECK_STATUS;
3005     REGEX_ASSERT(n==2);
3006     REGEX_ASSERT(fields[0]=="");
3007     REGEX_ASSERT(fields[1]=="");
3008     REGEX_ASSERT(fields[2]=="*");
3009
3010     fields[0] = "foo";
3011     n = pat1->split("", fields, 10, status);
3012     REGEX_CHECK_STATUS;
3013     REGEX_ASSERT(n==0);
3014     REGEX_ASSERT(fields[0]=="foo");
3015
3016     delete pat1;
3017
3018     //  split, with a pattern with (capture)
3019     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3020     pat1 = RegexPattern::compile(&re1,  pe, status);
3021     REGEX_CHECK_STATUS;
3022
3023     status = U_ZERO_ERROR;
3024     fields[6] = fields[7] = "*";
3025     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3026     REGEX_CHECK_STATUS;
3027     REGEX_ASSERT(n==7);
3028     REGEX_ASSERT(fields[0]=="");
3029     REGEX_ASSERT(fields[1]=="a");
3030     REGEX_ASSERT(fields[2]=="Now is ");
3031     REGEX_ASSERT(fields[3]=="b");
3032     REGEX_ASSERT(fields[4]=="the time");
3033     REGEX_ASSERT(fields[5]=="c");
3034     REGEX_ASSERT(fields[6]=="");
3035     REGEX_ASSERT(fields[7]=="*");
3036     REGEX_ASSERT(status==U_ZERO_ERROR);
3037
3038     fields[6] = fields[7] = "*";
3039     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3040     REGEX_CHECK_STATUS;
3041     REGEX_ASSERT(n==7);
3042     REGEX_ASSERT(fields[0]=="  ");
3043     REGEX_ASSERT(fields[1]=="a");
3044     REGEX_ASSERT(fields[2]=="Now is ");
3045     REGEX_ASSERT(fields[3]=="b");
3046     REGEX_ASSERT(fields[4]=="the time");
3047     REGEX_ASSERT(fields[5]=="c");
3048     REGEX_ASSERT(fields[6]=="");
3049     REGEX_ASSERT(fields[7]=="*");
3050
3051     status = U_ZERO_ERROR;
3052     fields[6] = "foo";
3053     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3054     REGEX_CHECK_STATUS;
3055     REGEX_ASSERT(n==6);
3056     REGEX_ASSERT(fields[0]=="  ");
3057     REGEX_ASSERT(fields[1]=="a");
3058     REGEX_ASSERT(fields[2]=="Now is ");
3059     REGEX_ASSERT(fields[3]=="b");
3060     REGEX_ASSERT(fields[4]=="the time");
3061     REGEX_ASSERT(fields[5]==" ");
3062     REGEX_ASSERT(fields[6]=="foo");
3063
3064     status = U_ZERO_ERROR;
3065     fields[5] = "foo";
3066     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3067     REGEX_CHECK_STATUS;
3068     REGEX_ASSERT(n==5);
3069     REGEX_ASSERT(fields[0]=="  ");
3070     REGEX_ASSERT(fields[1]=="a");
3071     REGEX_ASSERT(fields[2]=="Now is ");
3072     REGEX_ASSERT(fields[3]=="b");
3073     REGEX_ASSERT(fields[4]=="the time<c>");
3074     REGEX_ASSERT(fields[5]=="foo");
3075
3076     status = U_ZERO_ERROR;
3077     fields[5] = "foo";
3078     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3079     REGEX_CHECK_STATUS;
3080     REGEX_ASSERT(n==5);
3081     REGEX_ASSERT(fields[0]=="  ");
3082     REGEX_ASSERT(fields[1]=="a");
3083     REGEX_ASSERT(fields[2]=="Now is ");
3084     REGEX_ASSERT(fields[3]=="b");
3085     REGEX_ASSERT(fields[4]=="the time");
3086     REGEX_ASSERT(fields[5]=="foo");
3087
3088     status = U_ZERO_ERROR;
3089     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3090     REGEX_CHECK_STATUS;
3091     REGEX_ASSERT(n==4);
3092     REGEX_ASSERT(fields[0]=="  ");
3093     REGEX_ASSERT(fields[1]=="a");
3094     REGEX_ASSERT(fields[2]=="Now is ");
3095     REGEX_ASSERT(fields[3]=="the time<c>");
3096     status = U_ZERO_ERROR;
3097     delete pat1;
3098
3099     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3100     pat1 = RegexPattern::compile(&re1, pe, status);
3101     REGEX_CHECK_STATUS;
3102     n = pat1->split("1-10,20", fields, 10, status);
3103     REGEX_CHECK_STATUS;
3104     REGEX_ASSERT(n==5);
3105     REGEX_ASSERT(fields[0]=="1");
3106     REGEX_ASSERT(fields[1]=="-");
3107     REGEX_ASSERT(fields[2]=="10");
3108     REGEX_ASSERT(fields[3]==",");
3109     REGEX_ASSERT(fields[4]=="20");
3110     delete pat1;
3111
3112
3113     //
3114     // split of a UText based string, with library allocating output UTexts.
3115     //
3116     {
3117         status = U_ZERO_ERROR;
3118         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3119         UnicodeString stringToSplit("first:second:third");
3120         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3121         REGEX_CHECK_STATUS;
3122
3123         UText *splits[10] = {NULL};
3124         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3125         REGEX_CHECK_STATUS;
3126         REGEX_ASSERT(numFields == 5);
3127         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3128         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3129         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3130         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3131         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3132         REGEX_ASSERT(splits[5] == NULL);
3133
3134         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3135             if (splits[i]) {
3136                 utext_close(splits[i]);
3137                 splits[i] = NULL;
3138             }
3139         }
3140         utext_close(textToSplit);
3141     }
3142
3143
3144     //
3145     // RegexPattern::pattern() and patternText()
3146     //
3147     pat1 = new RegexPattern();
3148     REGEX_ASSERT(pat1->pattern() == "");
3149     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3150     delete pat1;
3151     const char *helloWorldInvariant = "(Hello, world)*";
3152     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3153     pat1 = RegexPattern::compile(&re1, pe, status);
3154     REGEX_CHECK_STATUS;
3155     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3156     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3157     delete pat1;
3158
3159     utext_close(&re1);
3160 }
3161
3162
3163 //---------------------------------------------------------------------------
3164 //
3165 //      Extended       A more thorough check for features of regex patterns
3166 //                     The test cases are in a separate data file,
3167 //                       source/tests/testdata/regextst.txt
3168 //                     A description of the test data format is included in that file.
3169 //
3170 //---------------------------------------------------------------------------
3171
3172 const char *
3173 RegexTest::getPath(char buffer[2048], const char *filename) {
3174     UErrorCode status=U_ZERO_ERROR;
3175     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3176     if (U_FAILURE(status)) {
3177         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3178         return NULL;
3179     }
3180
3181     strcpy(buffer, testDataDirectory);
3182     strcat(buffer, filename);
3183     return buffer;
3184 }
3185
3186 void RegexTest::Extended() {
3187     char tdd[2048];
3188     const char *srcPath;
3189     UErrorCode  status  = U_ZERO_ERROR;
3190     int32_t     lineNum = 0;
3191
3192     //
3193     //  Open and read the test data file.
3194     //
3195     srcPath=getPath(tdd, "regextst.txt");
3196     if(srcPath==NULL) {
3197         return; /* something went wrong, error already output */
3198     }
3199
3200     int32_t    len;
3201     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3202     if (U_FAILURE(status)) {
3203         return; /* something went wrong, error already output */
3204     }
3205
3206     //
3207     //  Put the test data into a UnicodeString
3208     //
3209     UnicodeString testString(FALSE, testData, len);
3210
3211     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3212     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3213     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3214
3215     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3216     UnicodeString   testPattern;   // The pattern for test from the test file.
3217     UnicodeString   testFlags;     // the flags   for a test.
3218     UnicodeString   matchString;   // The marked up string to be used as input
3219
3220     if (U_FAILURE(status)){
3221         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3222         delete [] testData;
3223         return;
3224     }
3225
3226     //
3227     //  Loop over the test data file, once per line.
3228     //
3229     while (lineMat.find()) {
3230         lineNum++;
3231         if (U_FAILURE(status)) {
3232           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3233         }
3234
3235         status = U_ZERO_ERROR;
3236         UnicodeString testLine = lineMat.group(1, status);
3237         if (testLine.length() == 0) {
3238             continue;
3239         }
3240
3241         //
3242         // Parse the test line.  Skip blank and comment only lines.
3243         // Separate out the three main fields - pattern, flags, target.
3244         //
3245
3246         commentMat.reset(testLine);
3247         if (commentMat.lookingAt(status)) {
3248             // This line is a comment, or blank.
3249             continue;
3250         }
3251
3252         //
3253         //  Pull out the pattern field, remove it from the test file line.
3254         //
3255         quotedStuffMat.reset(testLine);
3256         if (quotedStuffMat.lookingAt(status)) {
3257             testPattern = quotedStuffMat.group(2, status);
3258             testLine.remove(0, quotedStuffMat.end(0, status));
3259         } else {
3260             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3261             continue;
3262         }
3263
3264
3265         //
3266         //  Pull out the flags from the test file line.
3267         //
3268         flagsMat.reset(testLine);
3269         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3270         testFlags = flagsMat.group(1, status);
3271         if (flagsMat.group(2, status).length() > 0) {
3272             errln("Bad Match flag at line %d. Scanning %c\n",
3273                 lineNum, flagsMat.group(2, status).charAt(0));
3274             continue;
3275         }
3276         testLine.remove(0, flagsMat.end(0, status));
3277
3278         //
3279         //  Pull out the match string, as a whole.
3280         //    We'll process the <tags> later.
3281         //
3282         quotedStuffMat.reset(testLine);
3283         if (quotedStuffMat.lookingAt(status)) {
3284             matchString = quotedStuffMat.group(2, status);
3285             testLine.remove(0, quotedStuffMat.end(0, status));
3286         } else {
3287             errln("Bad match string at test file line %d", lineNum);
3288             continue;
3289         }
3290
3291         //
3292         //  The only thing left from the input line should be an optional trailing comment.
3293         //
3294         commentMat.reset(testLine);
3295         if (commentMat.lookingAt(status) == FALSE) {
3296             errln("Line %d: unexpected characters at end of test line.", lineNum);
3297             continue;
3298         }
3299
3300         //
3301         //  Run the test
3302         //
3303         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3304     }
3305
3306     delete [] testData;
3307
3308 }
3309
3310
3311
3312 //---------------------------------------------------------------------------
3313 //
3314 //    regex_find(pattern, flags, inputString, lineNumber)
3315 //
3316 //         Function to run a single test from the Extended (data driven) tests.
3317 //         See file test/testdata/regextst.txt for a description of the
3318 //         pattern and inputString fields, and the allowed flags.
3319 //         lineNumber is the source line in regextst.txt of the test.
3320 //
3321 //---------------------------------------------------------------------------
3322
3323
3324 //  Set a value into a UVector at position specified by a decimal number in
3325 //   a UnicodeString.   This is a utility function needed by the actual test function,
3326 //   which follows.
3327 static void set(UVector &vec, int32_t val, UnicodeString index) {
3328     UErrorCode  status=U_ZERO_ERROR;
3329     int32_t  idx = 0;
3330     for (int32_t i=0; i<index.length(); i++) {
3331         int32_t d=u_charDigitValue(index.charAt(i));
3332         if (d<0) {return;}
3333         idx = idx*10 + d;
3334     }
3335     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3336     vec.setElementAt(val, idx);
3337 }
3338
3339 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3340     UErrorCode  status=U_ZERO_ERROR;
3341     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3342     vec.setElementAt(val, idx);
3343 }
3344
3345 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3346 {
3347     UBool couldFind = TRUE;
3348     UTEXT_SETNATIVEINDEX(utext, 0);
3349     int32_t i = 0;
3350     while (i < unistrOffset) {
3351         UChar32 c = UTEXT_NEXT32(utext);
3352         if (c != U_SENTINEL) {
3353             i += U16_LENGTH(c);
3354         } else {
3355             couldFind = FALSE;
3356             break;
3357         }
3358     }
3359     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3360     return couldFind;
3361 }
3362
3363
3364 void RegexTest::regex_find(const UnicodeString &pattern,
3365                            const UnicodeString &flags,
3366                            const UnicodeString &inputString,
3367                            const char *srcPath,
3368                            int32_t line) {
3369     UnicodeString       unEscapedInput;
3370     UnicodeString       deTaggedInput;
3371
3372     int32_t             patternUTF8Length,      inputUTF8Length;
3373     char                *patternChars  = NULL, *inputChars = NULL;
3374     UText               patternText    = UTEXT_INITIALIZER;
3375     UText               inputText      = UTEXT_INITIALIZER;
3376     UConverter          *UTF8Converter = NULL;
3377
3378     UErrorCode          status         = U_ZERO_ERROR;
3379     UParseError         pe;
3380     RegexPattern        *parsePat      = NULL;
3381     RegexMatcher        *parseMatcher  = NULL;
3382     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3383     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3384     UVector             groupStarts(status);
3385     UVector             groupEnds(status);
3386     UVector             groupStartsUTF8(status);
3387     UVector             groupEndsUTF8(status);
3388     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3389     UBool               failed         = FALSE;
3390     int32_t             numFinds;
3391     int32_t             i;
3392     UBool               useMatchesFunc   = FALSE;
3393     UBool               useLookingAtFunc = FALSE;
3394     int32_t             regionStart      = -1;
3395     int32_t             regionEnd        = -1;
3396     int32_t             regionStartUTF8  = -1;
3397     int32_t             regionEndUTF8    = -1;
3398
3399
3400     //
3401     //  Compile the caller's pattern
3402     //
3403     uint32_t bflags = 0;
3404     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3405         bflags |= UREGEX_CASE_INSENSITIVE;
3406     }
3407     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3408         bflags |= UREGEX_COMMENTS;
3409     }
3410     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3411         bflags |= UREGEX_DOTALL;
3412     }
3413     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3414         bflags |= UREGEX_MULTILINE;
3415     }
3416
3417     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3418         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3419     }
3420     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3421         bflags |= UREGEX_UNIX_LINES;
3422     }
3423     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3424         bflags |= UREGEX_LITERAL;
3425     }
3426
3427
3428     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3429     if (status != U_ZERO_ERROR) {
3430         #if UCONFIG_NO_BREAK_ITERATION==1
3431         // 'v' test flag means that the test pattern should not compile if ICU was configured
3432         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3433         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3434             goto cleanupAndReturn;
3435         }
3436         #endif
3437         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3438             // Expected pattern compilation error.
3439             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3440                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3441             }
3442             goto cleanupAndReturn;
3443         } else {
3444             // Unexpected pattern compilation error.
3445             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3446             goto cleanupAndReturn;
3447         }
3448     }
3449
3450     UTF8Converter = ucnv_open("UTF8", &status);
3451     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3452
3453     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3454     status = U_ZERO_ERROR; // buffer overflow
3455     patternChars = new char[patternUTF8Length+1];
3456     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3457     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3458
3459     if (status == U_ZERO_ERROR) {
3460         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3461
3462         if (status != U_ZERO_ERROR) {
3463 #if UCONFIG_NO_BREAK_ITERATION==1
3464             // 'v' test flag means that the test pattern should not compile if ICU was configured
3465             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3466             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3467                 goto cleanupAndReturn;
3468             }
3469 #endif
3470             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3471                 // Expected pattern compilation error.
3472                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3473                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3474                 }
3475                 goto cleanupAndReturn;
3476             } else {
3477                 // Unexpected pattern compilation error.
3478                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3479                 goto cleanupAndReturn;
3480             }
3481         }
3482     }
3483
3484     if (UTF8Pattern == NULL) {
3485         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3486         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3487         status = U_ZERO_ERROR;
3488     }
3489
3490     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3491         callerPattern->dumpPattern();
3492     }
3493
3494     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3495         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3496         goto cleanupAndReturn;
3497     }
3498
3499
3500     //
3501     // Number of times find() should be called on the test string, default to 1
3502     //
3503     numFinds = 1;
3504     for (i=2; i<=9; i++) {
3505         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3506             if (numFinds != 1) {
3507                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3508                 goto cleanupAndReturn;
3509             }
3510             numFinds = i;
3511         }
3512     }
3513
3514     // 'M' flag.  Use matches() instead of find()
3515     if (flags.indexOf((UChar)0x4d) >= 0) {
3516         useMatchesFunc = TRUE;
3517     }
3518     if (flags.indexOf((UChar)0x4c) >= 0) {
3519         useLookingAtFunc = TRUE;
3520     }
3521
3522     //
3523     //  Find the tags in the input data, remove them, and record the group boundary
3524     //    positions.
3525     //
3526     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3527     REGEX_CHECK_STATUS_L(line);
3528
3529     unEscapedInput = inputString.unescape();
3530     parseMatcher = parsePat->matcher(unEscapedInput, status);
3531     REGEX_CHECK_STATUS_L(line);
3532     while(parseMatcher->find()) {
3533         parseMatcher->appendReplacement(deTaggedInput, "", status);
3534         REGEX_CHECK_STATUS;
3535         UnicodeString groupNum = parseMatcher->group(2, status);
3536         if (groupNum == "r") {
3537             // <r> or </r>, a region specification within the string
3538             if (parseMatcher->group(1, status) == "/") {
3539                 regionEnd = deTaggedInput.length();
3540             } else {
3541                 regionStart = deTaggedInput.length();
3542             }
3543         } else {
3544             // <digits> or </digits>, a group match boundary tag.
3545             if (parseMatcher->group(1, status) == "/") {
3546                 set(groupEnds, deTaggedInput.length(), groupNum);
3547             } else {
3548                 set(groupStarts, deTaggedInput.length(), groupNum);
3549             }
3550         }
3551     }
3552     parseMatcher->appendTail(deTaggedInput);
3553     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3554     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3555       errln("mismatched <r> tags");
3556       failed = TRUE;
3557       goto cleanupAndReturn;
3558     }
3559
3560     //
3561     //  Configure the matcher according to the flags specified with this test.
3562     //
3563     matcher = callerPattern->matcher(deTaggedInput, status);
3564     REGEX_CHECK_STATUS_L(line);
3565     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3566         matcher->setTrace(TRUE);
3567     }
3568
3569     if (UTF8Pattern != NULL) {
3570         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3571         status = U_ZERO_ERROR; // buffer overflow
3572         inputChars = new char[inputUTF8Length+1];
3573         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3574         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3575
3576         if (status == U_ZERO_ERROR) {
3577             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3578             REGEX_CHECK_STATUS_L(line);
3579         }
3580
3581         if (UTF8Matcher == NULL) {
3582             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3583           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3584             status = U_ZERO_ERROR;
3585         }
3586     }
3587
3588     //
3589     //  Generate native indices for UTF8 versions of region and capture group info
3590     //
3591     if (UTF8Matcher != NULL) {
3592         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3593         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3594
3595         //  Fill out the native index UVector info.
3596         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3597         for (i=0; i<groupStarts.size(); i++) {
3598             int32_t  start = groupStarts.elementAti(i);
3599             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3600             if (start >= 0) {
3601                 int32_t  startUTF8;
3602                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3603                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3604                     failed = TRUE;
3605                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3606                 }
3607                 setInt(groupStartsUTF8, startUTF8, i);
3608             }
3609
3610             int32_t  end = groupEnds.elementAti(i);
3611             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3612             if (end >= 0) {
3613                 int32_t  endUTF8;
3614                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3615                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3616                     failed = TRUE;
3617                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3618                 }
3619                 setInt(groupEndsUTF8, endUTF8, i);
3620             }
3621         }
3622     }
3623
3624     if (regionStart>=0) {
3625        matcher->region(regionStart, regionEnd, status);
3626        REGEX_CHECK_STATUS_L(line);
3627        if (UTF8Matcher != NULL) {
3628            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3629            REGEX_CHECK_STATUS_L(line);
3630        }
3631     }
3632     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3633         matcher->useAnchoringBounds(FALSE);
3634         if (UTF8Matcher != NULL) {
3635             UTF8Matcher->useAnchoringBounds(FALSE);
3636         }
3637     }
3638     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3639         matcher->useTransparentBounds(TRUE);
3640         if (UTF8Matcher != NULL) {
3641             UTF8Matcher->useTransparentBounds(TRUE);
3642         }
3643     }
3644
3645
3646
3647     //
3648     // Do a find on the de-tagged input using the caller's pattern
3649     //     TODO: error on count>1 and not find().
3650     //           error on both matches() and lookingAt().
3651     //
3652     for (i=0; i<numFinds; i++) {
3653         if (useMatchesFunc) {
3654             isMatch = matcher->matches(status);
3655             if (UTF8Matcher != NULL) {
3656                isUTF8Match = UTF8Matcher->matches(status);
3657             }
3658         } else  if (useLookingAtFunc) {
3659             isMatch = matcher->lookingAt(status);
3660             if (UTF8Matcher != NULL) {
3661                 isUTF8Match = UTF8Matcher->lookingAt(status);
3662             }
3663         } else {
3664             isMatch = matcher->find();
3665             if (UTF8Matcher != NULL) {
3666                 isUTF8Match = UTF8Matcher->find();
3667             }
3668         }
3669     }
3670     matcher->setTrace(FALSE);
3671     if (U_FAILURE(status)) {
3672         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3673     }
3674
3675     //
3676     // Match up the groups from the find() with the groups from the tags
3677     //
3678
3679     // number of tags should match number of groups from find operation.
3680     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3681     //   G option in test means that capture group data is not available in the
3682     //     expected results, so the check needs to be suppressed.
3683     if (isMatch == FALSE && groupStarts.size() != 0) {
3684         dataerrln("Error at line %d:  Match expected, but none found.", line);
3685         failed = TRUE;
3686         goto cleanupAndReturn;
3687     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3688         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3689         failed = TRUE;
3690         goto cleanupAndReturn;
3691     }
3692
3693     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694         // Only check for match / no match.  Don't check capture groups.
3695         if (isMatch && groupStarts.size() == 0) {
3696             errln("Error at line %d:  No match expected, but one found.", line);
3697             failed = TRUE;
3698         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3699             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3700             failed = TRUE;
3701         }
3702         goto cleanupAndReturn;
3703     }
3704
3705     REGEX_CHECK_STATUS_L(line);
3706     for (i=0; i<=matcher->groupCount(); i++) {
3707         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3708         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3709         if (matcher->start(i, status) != expectedStart) {
3710             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3711                 line, i, expectedStart, matcher->start(i, status));
3712             failed = TRUE;
3713             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3714         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3715             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3716                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3717             failed = TRUE;
3718             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3719         }
3720
3721         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3722         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3723         if (matcher->end(i, status) != expectedEnd) {
3724             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3725                 line, i, expectedEnd, matcher->end(i, status));
3726             failed = TRUE;
3727             // Error on end position;  keep going; real error is probably yet to come as group
3728             //   end positions work from end of the input data towards the front.
3729         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3730             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3731                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3732             failed = TRUE;
3733             // Error on end position;  keep going; real error is probably yet to come as group
3734             //   end positions work from end of the input data towards the front.
3735         }
3736     }
3737     if ( matcher->groupCount()+1 < groupStarts.size()) {
3738         errln("Error at line %d: Expected %d capture groups, found %d.",
3739             line, groupStarts.size()-1, matcher->groupCount());
3740         failed = TRUE;
3741         }
3742     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3743         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3744               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3745         failed = TRUE;
3746     }
3747
3748     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3749         matcher->requireEnd() == TRUE) {
3750         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3751         failed = TRUE;
3752     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3753         UTF8Matcher->requireEnd() == TRUE) {
3754         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3755         failed = TRUE;
3756     }
3757
3758     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3759         matcher->requireEnd() == FALSE) {
3760         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3761         failed = TRUE;
3762     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3763         UTF8Matcher->requireEnd() == FALSE) {
3764         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3765         failed = TRUE;
3766     }
3767
3768     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3769         matcher->hitEnd() == TRUE) {
3770         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3771         failed = TRUE;
3772     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3773                UTF8Matcher->hitEnd() == TRUE) {
3774         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3775         failed = TRUE;
3776     }
3777
3778     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3779         matcher->hitEnd() == FALSE) {
3780         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3781         failed = TRUE;
3782     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3783                UTF8Matcher->hitEnd() == FALSE) {
3784         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3785         failed = TRUE;
3786     }
3787
3788
3789 cleanupAndReturn:
3790     if (failed) {
3791         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3792             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3793         // callerPattern->dump();
3794     }
3795     delete parseMatcher;
3796     delete parsePat;
3797     delete UTF8Matcher;
3798     delete UTF8Pattern;
3799     delete matcher;
3800     delete callerPattern;
3801
3802     utext_close(&inputText);
3803     delete[] inputChars;
3804     utext_close(&patternText);
3805     delete[] patternChars;
3806     ucnv_close(UTF8Converter);
3807 }
3808
3809
3810
3811
3812 //---------------------------------------------------------------------------
3813 //
3814 //      Errors     Check for error handling in patterns.
3815 //
3816 //---------------------------------------------------------------------------
3817 void RegexTest::Errors() {
3818     // \escape sequences that aren't implemented yet.
3819     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3820
3821     // Missing close parentheses
3822     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3823     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3824     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3825
3826     // Extra close paren
3827     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3828     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3829     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3830
3831     // Look-ahead, Look-behind
3832     //  TODO:  add tests for unbounded length look-behinds.
3833     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3834
3835     // Attempt to use non-default flags
3836     {
3837         UParseError   pe;
3838         UErrorCode    status = U_ZERO_ERROR;
3839         int32_t       flags  = UREGEX_CANON_EQ |
3840                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3841                                UREGEX_MULTILINE;
3842         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3843         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3844         delete pat1;
3845     }
3846
3847
3848     // Quantifiers are allowed only after something that can be quantified.
3849     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3850     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3851     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3852
3853     // Mal-formed {min,max} quantifiers
3854     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3855     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3856     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3857     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3858     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3859     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3860     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3861     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3862     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3863
3864     // Ticket 5389
3865     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3866
3867     // Invalid Back Reference \0
3868     //    For ICU 3.8 and earlier
3869     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3870     //
3871     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3872
3873 }
3874
3875
3876 //-------------------------------------------------------------------------------
3877 //
3878 //  Read a text data file, convert it to UChars, and return the data
3879 //    in one big UChar * buffer, which the caller must delete.
3880 //
3881 //--------------------------------------------------------------------------------
3882 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3883                                      const char *defEncoding, UErrorCode &status) {
3884     UChar       *retPtr  = NULL;
3885     char        *fileBuf = NULL;
3886     UConverter* conv     = NULL;
3887     FILE        *f       = NULL;
3888
3889     ulen = 0;
3890     if (U_FAILURE(status)) {
3891         return retPtr;
3892     }
3893
3894     //
3895     //  Open the file.
3896     //
3897     f = fopen(fileName, "rb");
3898     if (f == 0) {
3899         dataerrln("Error opening test data file %s\n", fileName);
3900         status = U_FILE_ACCESS_ERROR;
3901         return NULL;
3902     }
3903     //
3904     //  Read it in
3905     //
3906     int32_t            fileSize;
3907     int32_t            amt_read;
3908
3909     fseek( f, 0, SEEK_END);
3910     fileSize = ftell(f);
3911     fileBuf = new char[fileSize];
3912     fseek(f, 0, SEEK_SET);
3913     amt_read = fread(fileBuf, 1, fileSize, f);
3914     if (amt_read != fileSize || fileSize <= 0) {
3915         errln("Error reading test data file.");
3916         goto cleanUpAndReturn;
3917     }
3918
3919     //
3920     // Look for a Unicode Signature (BOM) on the data just read
3921     //
3922     int32_t        signatureLength;
3923     const char *   fileBufC;
3924     const char*    encoding;
3925
3926     fileBufC = fileBuf;
3927     encoding = ucnv_detectUnicodeSignature(
3928         fileBuf, fileSize, &signatureLength, &status);
3929     if(encoding!=NULL ){
3930         fileBufC  += signatureLength;
3931         fileSize  -= signatureLength;
3932     } else {
3933         encoding = defEncoding;
3934         if (strcmp(encoding, "utf-8") == 0) {
3935             errln("file %s is missing its BOM", fileName);
3936         }
3937     }
3938
3939     //
3940     // Open a converter to take the rule file to UTF-16
3941     //
3942     conv = ucnv_open(encoding, &status);
3943     if (U_FAILURE(status)) {
3944         goto cleanUpAndReturn;
3945     }
3946
3947     //
3948     // Convert the rules to UChar.
3949     //  Preflight first to determine required buffer size.
3950     //
3951     ulen = ucnv_toUChars(conv,
3952         NULL,           //  dest,
3953         0,              //  destCapacity,
3954         fileBufC,
3955         fileSize,
3956         &status);
3957     if (status == U_BUFFER_OVERFLOW_ERROR) {
3958         // Buffer Overflow is expected from the preflight operation.
3959         status = U_ZERO_ERROR;
3960
3961         retPtr = new UChar[ulen+1];
3962         ucnv_toUChars(conv,
3963             retPtr,       //  dest,
3964             ulen+1,
3965             fileBufC,
3966             fileSize,
3967             &status);
3968     }
3969
3970 cleanUpAndReturn:
3971     fclose(f);
3972     delete[] fileBuf;
3973     ucnv_close(conv);
3974     if (U_FAILURE(status)) {
3975         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3976         delete []retPtr;
3977         retPtr = 0;
3978         ulen   = 0;
3979     };
3980     return retPtr;
3981 }
3982
3983
3984 //-------------------------------------------------------------------------------
3985 //
3986 //   PerlTests  - Run Perl's regular expression tests
3987 //                The input file for this test is re_tests, the standard regular
3988 //                expression test data distributed with the Perl source code.
3989 //
3990 //                Here is Perl's description of the test data file:
3991 //
3992 //        # The tests are in a separate file 't/op/re_tests'.
3993 //        # Each line in that file is a separate test.
3994 //        # There are five columns, separated by tabs.
3995 //        #
3996 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3997 //        # Modifiers can be put after the closing C<'>.
3998 //        #
3999 //        # Column 2 contains the string to be matched.
4000 //        #
4001 //        # Column 3 contains the expected result:
4002 //        #     y   expect a match
4003 //        #     n   expect no match
4004 //        #     c   expect an error
4005 //        # B   test exposes a known bug in Perl, should be skipped
4006 //        # b   test exposes a known bug in Perl, should be skipped if noamp
4007 //        #
4008 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4009 //        #
4010 //        # Column 4 contains a string, usually C<$&>.
4011 //        #
4012 //        # Column 5 contains the expected result of double-quote
4013 //        # interpolating that string after the match, or start of error message.
4014 //        #
4015 //        # Column 6, if present, contains a reason why the test is skipped.
4016 //        # This is printed with "skipped", for harness to pick up.
4017 //        #
4018 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4019 //        #
4020 //        # If you want to add a regular expression test that can't be expressed
4021 //        # in this format, don't add it here: put it in op/pat.t instead.
4022 //
4023 //        For ICU, if field 3 contains an 'i', the test will be skipped.
4024 //        The test exposes some known incompatibility between ICU and Perl regexps.
4025 //        (The i is in addition to whatever was there before.)
4026 //
4027 //-------------------------------------------------------------------------------
4028 void RegexTest::PerlTests() {
4029     char tdd[2048];
4030     const char *srcPath;
4031     UErrorCode  status = U_ZERO_ERROR;
4032     UParseError pe;
4033
4034     //
4035     //  Open and read the test data file.
4036     //
4037     srcPath=getPath(tdd, "re_tests.txt");
4038     if(srcPath==NULL) {
4039         return; /* something went wrong, error already output */
4040     }
4041
4042     int32_t    len;
4043     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4044     if (U_FAILURE(status)) {
4045         return; /* something went wrong, error already output */
4046     }
4047
4048     //
4049     //  Put the test data into a UnicodeString
4050     //
4051     UnicodeString testDataString(FALSE, testData, len);
4052
4053     //
4054     //  Regex to break the input file into lines, and strip the new lines.
4055     //     One line per match, capture group one is the desired data.
4056     //
4057     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4058     if (U_FAILURE(status)) {
4059         dataerrln("RegexPattern::compile() error");
4060         return;
4061     }
4062     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4063
4064     //
4065     //  Regex to split a test file line into fields.
4066     //    There are six fields, separated by tabs.
4067     //
4068     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4069
4070     //
4071     //  Regex to identify test patterns with flag settings, and to separate them.
4072     //    Test patterns with flags look like 'pattern'i
4073     //    Test patterns without flags are not quoted:   pattern
4074     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4075     //
4076     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4077     RegexMatcher* flagMat = flagPat->matcher(status);
4078
4079     //
4080     // The Perl tests reference several perl-isms, which are evaluated/substituted
4081     //   in the test data.  Not being perl, this must be done explicitly.  Here
4082     //   are string constants and REs for these constructs.
4083     //
4084     UnicodeString nulnulSrc("${nulnul}");
4085     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4086     nulnul = nulnul.unescape();
4087
4088     UnicodeString ffffSrc("${ffff}");
4089     UnicodeString ffff("\\uffff", -1, US_INV);
4090     ffff = ffff.unescape();
4091
4092     //  regexp for $-[0], $+[2], etc.
4093     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4094     RegexMatcher *groupsMat = groupsPat->matcher(status);
4095
4096     //  regexp for $0, $1, $2, etc.
4097     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4098     RegexMatcher *cgMat = cgPat->matcher(status);
4099
4100
4101     //
4102     // Main Loop for the Perl Tests, runs once per line from the
4103     //   test data file.
4104     //
4105     int32_t  lineNum = 0;
4106     int32_t  skippedUnimplementedCount = 0;
4107     while (lineMat->find()) {
4108         lineNum++;
4109
4110         //
4111         //  Get a line, break it into its fields, do the Perl
4112         //    variable substitutions.
4113         //
4114         UnicodeString line = lineMat->group(1, status);
4115         UnicodeString fields[7];
4116         fieldPat->split(line, fields, 7, status);
4117
4118         flagMat->reset(fields[0]);
4119         flagMat->matches(status);
4120         UnicodeString pattern  = flagMat->group(2, status);
4121         pattern.findAndReplace("${bang}", "!");
4122         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4123         pattern.findAndReplace(ffffSrc, ffff);
4124
4125         //
4126         //  Identify patterns that include match flag settings,
4127         //    split off the flags, remove the extra quotes.
4128         //
4129         UnicodeString flagStr = flagMat->group(3, status);
4130         if (U_FAILURE(status)) {
4131             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4132             return;
4133         }
4134         int32_t flags = 0;
4135         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4136         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4137         const UChar UChar_m = 0x6d;
4138         const UChar UChar_x = 0x78;
4139         const UChar UChar_y = 0x79;
4140         if (flagStr.indexOf(UChar_i) != -1) {
4141             flags |= UREGEX_CASE_INSENSITIVE;
4142         }
4143         if (flagStr.indexOf(UChar_m) != -1) {
4144             flags |= UREGEX_MULTILINE;
4145         }
4146         if (flagStr.indexOf(UChar_x) != -1) {
4147             flags |= UREGEX_COMMENTS;
4148         }
4149
4150         //
4151         // Compile the test pattern.
4152         //
4153         status = U_ZERO_ERROR;
4154         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4155         if (status == U_REGEX_UNIMPLEMENTED) {
4156             //
4157             // Test of a feature that is planned for ICU, but not yet implemented.
4158             //   skip the test.
4159             skippedUnimplementedCount++;
4160             delete testPat;
4161             status = U_ZERO_ERROR;
4162             continue;
4163         }
4164
4165         if (U_FAILURE(status)) {
4166             // Some tests are supposed to generate errors.
4167             //   Only report an error for tests that are supposed to succeed.
4168             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4169                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4170             {
4171                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4172             }
4173             status = U_ZERO_ERROR;
4174             delete testPat;
4175             continue;
4176         }
4177
4178         if (fields[2].indexOf(UChar_i) >= 0) {
4179             // ICU should skip this test.
4180             delete testPat;
4181             continue;
4182         }
4183
4184         if (fields[2].indexOf(UChar_c) >= 0) {
4185             // This pattern should have caused a compilation error, but didn't/
4186             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4187             delete testPat;
4188             continue;
4189         }
4190
4191         //
4192         // replace the Perl variables that appear in some of the
4193         //   match data strings.
4194         //
4195         UnicodeString matchString = fields[1];
4196         matchString.findAndReplace(nulnulSrc, nulnul);
4197         matchString.findAndReplace(ffffSrc,   ffff);
4198
4199         // Replace any \n in the match string with an actual new-line char.
4200         //  Don't do full unescape, as this unescapes more than Perl does, which
4201         //  causes other spurious failures in the tests.
4202         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4203
4204
4205
4206         //
4207         // Run the test, check for expected match/don't match result.
4208         //
4209         RegexMatcher *testMat = testPat->matcher(matchString, status);
4210         UBool found = testMat->find();
4211         UBool expected = FALSE;
4212         if (fields[2].indexOf(UChar_y) >=0) {
4213             expected = TRUE;
4214         }
4215         if (expected != found) {
4216             errln("line %d: Expected %smatch, got %smatch",
4217                 lineNum, expected?"":"no ", found?"":"no " );
4218             continue;
4219         }
4220
4221         // Don't try to check expected results if there is no match.
4222         //   (Some have stuff in the expected fields)
4223         if (!found) {
4224             delete testMat;
4225             delete testPat;
4226             continue;
4227         }
4228
4229         //
4230         // Interpret the Perl expression from the fourth field of the data file,
4231         // building up an ICU string from the results of the ICU match.
4232         //   The Perl expression will contain references to the results of
4233         //     a regex match, including the matched string, capture group strings,
4234         //     group starting and ending indicies, etc.
4235         //
4236         UnicodeString resultString;
4237         UnicodeString perlExpr = fields[3];
4238 #if SUPPORT_MUTATING_INPUT_STRING
4239         groupsMat->reset(perlExpr);
4240         cgMat->reset(perlExpr);
4241 #endif
4242
4243         while (perlExpr.length() > 0) {
4244 #if !SUPPORT_MUTATING_INPUT_STRING
4245             //  Perferred usage.  Reset after any modification to input string.
4246             groupsMat->reset(perlExpr);
4247             cgMat->reset(perlExpr);
4248 #endif
4249
4250             if (perlExpr.startsWith("$&")) {
4251                 resultString.append(testMat->group(status));
4252                 perlExpr.remove(0, 2);
4253             }
4254
4255             else if (groupsMat->lookingAt(status)) {
4256                 // $-[0]   $+[2]  etc.
4257                 UnicodeString digitString = groupsMat->group(2, status);
4258                 int32_t t = 0;
4259                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4260                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4261                 int32_t matchPosition;
4262                 if (plusOrMinus.compare("+") == 0) {
4263                     matchPosition = testMat->end(groupNum, status);
4264                 } else {
4265                     matchPosition = testMat->start(groupNum, status);
4266                 }
4267                 if (matchPosition != -1) {
4268                     ICU_Utility::appendNumber(resultString, matchPosition);
4269                 }
4270                 perlExpr.remove(0, groupsMat->end(status));
4271             }
4272
4273             else if (cgMat->lookingAt(status)) {
4274                 // $1, $2, $3, etc.
4275                 UnicodeString digitString = cgMat->group(1, status);
4276                 int32_t t = 0;
4277                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4278                 if (U_SUCCESS(status)) {
4279                     resultString.append(testMat->group(groupNum, status));
4280                     status = U_ZERO_ERROR;
4281                 }
4282                 perlExpr.remove(0, cgMat->end(status));
4283             }
4284
4285             else if (perlExpr.startsWith("@-")) {
4286                 int32_t i;
4287                 for (i=0; i<=testMat->groupCount(); i++) {
4288                     if (i>0) {
4289                         resultString.append(" ");
4290                     }
4291                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4292                 }
4293                 perlExpr.remove(0, 2);
4294             }
4295
4296             else if (perlExpr.startsWith("@+")) {
4297                 int32_t i;
4298                 for (i=0; i<=testMat->groupCount(); i++) {
4299                     if (i>0) {
4300                         resultString.append(" ");
4301                     }
4302                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4303                 }
4304                 perlExpr.remove(0, 2);
4305             }
4306
4307             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4308                                                      //           or as an escaped sequence (e.g. \n)
4309                 if (perlExpr.length() > 1) {
4310                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4311                 }
4312                 UChar c = perlExpr.charAt(0);
4313                 switch (c) {
4314                 case 'n':   c = '\n'; break;
4315                 // add any other escape sequences that show up in the test expected results.
4316                 }
4317                 resultString.append(c);
4318                 perlExpr.remove(0, 1);
4319             }
4320
4321             else  {
4322                 // Any characters from the perl expression that we don't explicitly
4323                 //  recognize before here are assumed to be literals and copied
4324                 //  as-is to the expected results.
4325                 resultString.append(perlExpr.charAt(0));
4326                 perlExpr.remove(0, 1);
4327             }
4328
4329             if (U_FAILURE(status)) {
4330                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4331                 break;
4332             }
4333         }
4334
4335         //
4336         // Expected Results Compare
4337         //
4338         UnicodeString expectedS(fields[4]);
4339         expectedS.findAndReplace(nulnulSrc, nulnul);
4340         expectedS.findAndReplace(ffffSrc,   ffff);
4341         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4342
4343
4344         if (expectedS.compare(resultString) != 0) {
4345             err("Line %d: Incorrect perl expression results.", lineNum);
4346             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4347         }
4348
4349         delete testMat;
4350         delete testPat;
4351     }
4352
4353     //
4354     // All done.  Clean up allocated stuff.
4355     //
4356     delete cgMat;
4357     delete cgPat;
4358
4359     delete groupsMat;
4360     delete groupsPat;
4361
4362     delete flagMat;
4363     delete flagPat;
4364
4365     delete lineMat;
4366     delete linePat;
4367
4368     delete fieldPat;
4369     delete [] testData;
4370
4371
4372     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4373
4374 }
4375
4376
4377 //-------------------------------------------------------------------------------
4378 //
4379 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4380 //                  (instead of using UnicodeStrings) to test the alternate engine.
4381 //                  The input file for this test is re_tests, the standard regular
4382 //                  expression test data distributed with the Perl source code.
4383 //                  See PerlTests() for more information.
4384 //
4385 //-------------------------------------------------------------------------------
4386 void RegexTest::PerlTestsUTF8() {
4387     char tdd[2048];
4388     const char *srcPath;
4389     UErrorCode  status = U_ZERO_ERROR;
4390     UParseError pe;
4391     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4392     UText       patternText = UTEXT_INITIALIZER;
4393     char       *patternChars = NULL;
4394     int32_t     patternLength;
4395     int32_t     patternCapacity = 0;
4396     UText       inputText = UTEXT_INITIALIZER;
4397     char       *inputChars = NULL;
4398     int32_t     inputLength;
4399     int32_t     inputCapacity = 0;
4400
4401     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4402
4403     //
4404     //  Open and read the test data file.
4405     //
4406     srcPath=getPath(tdd, "re_tests.txt");
4407     if(srcPath==NULL) {
4408         return; /* something went wrong, error already output */
4409     }
4410
4411     int32_t    len;
4412     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4413     if (U_FAILURE(status)) {
4414         return; /* something went wrong, error already output */
4415     }
4416
4417     //
4418     //  Put the test data into a UnicodeString
4419     //
4420     UnicodeString testDataString(FALSE, testData, len);
4421
4422     //
4423     //  Regex to break the input file into lines, and strip the new lines.
4424     //     One line per match, capture group one is the desired data.
4425     //
4426     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4427     if (U_FAILURE(status)) {
4428         dataerrln("RegexPattern::compile() error");
4429         return;
4430     }
4431     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4432
4433     //
4434     //  Regex to split a test file line into fields.
4435     //    There are six fields, separated by tabs.
4436     //
4437     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4438
4439     //
4440     //  Regex to identify test patterns with flag settings, and to separate them.
4441     //    Test patterns with flags look like 'pattern'i
4442     //    Test patterns without flags are not quoted:   pattern
4443     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4444     //
4445     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4446     RegexMatcher* flagMat = flagPat->matcher(status);
4447
4448     //
4449     // The Perl tests reference several perl-isms, which are evaluated/substituted
4450     //   in the test data.  Not being perl, this must be done explicitly.  Here
4451     //   are string constants and REs for these constructs.
4452     //
4453     UnicodeString nulnulSrc("${nulnul}");
4454     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4455     nulnul = nulnul.unescape();
4456
4457     UnicodeString ffffSrc("${ffff}");
4458     UnicodeString ffff("\\uffff", -1, US_INV);
4459     ffff = ffff.unescape();
4460
4461     //  regexp for $-[0], $+[2], etc.
4462     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4463     RegexMatcher *groupsMat = groupsPat->matcher(status);
4464
4465     //  regexp for $0, $1, $2, etc.
4466     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4467     RegexMatcher *cgMat = cgPat->matcher(status);
4468
4469
4470     //
4471     // Main Loop for the Perl Tests, runs once per line from the
4472     //   test data file.
4473     //
4474     int32_t  lineNum = 0;
4475     int32_t  skippedUnimplementedCount = 0;
4476     while (lineMat->find()) {
4477         lineNum++;
4478
4479         //
4480         //  Get a line, break it into its fields, do the Perl
4481         //    variable substitutions.
4482         //
4483         UnicodeString line = lineMat->group(1, status);
4484         UnicodeString fields[7];
4485         fieldPat->split(line, fields, 7, status);
4486
4487         flagMat->reset(fields[0]);
4488         flagMat->matches(status);
4489         UnicodeString pattern  = flagMat->group(2, status);
4490         pattern.findAndReplace("${bang}", "!");
4491         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4492         pattern.findAndReplace(ffffSrc, ffff);
4493
4494         //
4495         //  Identify patterns that include match flag settings,
4496         //    split off the flags, remove the extra quotes.
4497         //
4498         UnicodeString flagStr = flagMat->group(3, status);
4499         if (U_FAILURE(status)) {
4500             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4501             return;
4502         }
4503         int32_t flags = 0;
4504         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4505         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4506         const UChar UChar_m = 0x6d;
4507         const UChar UChar_x = 0x78;
4508         const UChar UChar_y = 0x79;
4509         if (flagStr.indexOf(UChar_i) != -1) {
4510             flags |= UREGEX_CASE_INSENSITIVE;
4511         }
4512         if (flagStr.indexOf(UChar_m) != -1) {
4513             flags |= UREGEX_MULTILINE;
4514         }
4515         if (flagStr.indexOf(UChar_x) != -1) {
4516             flags |= UREGEX_COMMENTS;
4517         }
4518
4519         //
4520         // Put the pattern in a UTF-8 UText
4521         //
4522         status = U_ZERO_ERROR;
4523         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4524         if (status == U_BUFFER_OVERFLOW_ERROR) {
4525             status = U_ZERO_ERROR;
4526             delete[] patternChars;
4527             patternCapacity = patternLength + 1;
4528             patternChars = new char[patternCapacity];
4529             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4530         }
4531         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4532
4533         //
4534         // Compile the test pattern.
4535         //
4536         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4537         if (status == U_REGEX_UNIMPLEMENTED) {
4538             //
4539             // Test of a feature that is planned for ICU, but not yet implemented.
4540             //   skip the test.
4541             skippedUnimplementedCount++;
4542             delete testPat;
4543             status = U_ZERO_ERROR;
4544             continue;
4545         }
4546
4547         if (U_FAILURE(status)) {
4548             // Some tests are supposed to generate errors.
4549             //   Only report an error for tests that are supposed to succeed.
4550             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4551                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4552             {
4553                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4554             }
4555             status = U_ZERO_ERROR;
4556             delete testPat;
4557             continue;
4558         }
4559
4560         if (fields[2].indexOf(UChar_i) >= 0) {
4561             // ICU should skip this test.
4562             delete testPat;
4563             continue;
4564         }
4565
4566         if (fields[2].indexOf(UChar_c) >= 0) {
4567             // This pattern should have caused a compilation error, but didn't/
4568             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4569             delete testPat;
4570             continue;
4571         }
4572
4573
4574         //
4575         // replace the Perl variables that appear in some of the
4576         //   match data strings.
4577         //
4578         UnicodeString matchString = fields[1];
4579         matchString.findAndReplace(nulnulSrc, nulnul);
4580         matchString.findAndReplace(ffffSrc,   ffff);
4581
4582         // Replace any \n in the match string with an actual new-line char.
4583         //  Don't do full unescape, as this unescapes more than Perl does, which
4584         //  causes other spurious failures in the tests.
4585         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4586
4587         //
4588         // Put the input in a UTF-8 UText
4589         //
4590         status = U_ZERO_ERROR;
4591         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4592         if (status == U_BUFFER_OVERFLOW_ERROR) {
4593             status = U_ZERO_ERROR;
4594             delete[] inputChars;
4595             inputCapacity = inputLength + 1;
4596             inputChars = new char[inputCapacity];
4597             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4598         }
4599         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4600
4601         //
4602         // Run the test, check for expected match/don't match result.
4603         //
4604         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4605         UBool found = testMat->find();
4606         UBool expected = FALSE;
4607         if (fields[2].indexOf(UChar_y) >=0) {
4608             expected = TRUE;
4609         }
4610         if (expected != found) {
4611             errln("line %d: Expected %smatch, got %smatch",
4612                 lineNum, expected?"":"no ", found?"":"no " );
4613             continue;
4614         }
4615
4616         // Don't try to check expected results if there is no match.
4617         //   (Some have stuff in the expected fields)
4618         if (!found) {
4619             delete testMat;
4620             delete testPat;
4621             continue;
4622         }
4623
4624         //
4625         // Interpret the Perl expression from the fourth field of the data file,
4626         // building up an ICU string from the results of the ICU match.
4627         //   The Perl expression will contain references to the results of
4628         //     a regex match, including the matched string, capture group strings,
4629         //     group starting and ending indicies, etc.
4630         //
4631         UnicodeString resultString;
4632         UnicodeString perlExpr = fields[3];
4633
4634         while (perlExpr.length() > 0) {
4635             groupsMat->reset(perlExpr);
4636             cgMat->reset(perlExpr);
4637
4638             if (perlExpr.startsWith("$&")) {
4639                 resultString.append(testMat->group(status));
4640                 perlExpr.remove(0, 2);
4641             }
4642
4643             else if (groupsMat->lookingAt(status)) {
4644                 // $-[0]   $+[2]  etc.
4645                 UnicodeString digitString = groupsMat->group(2, status);
4646                 int32_t t = 0;
4647                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4648                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4649                 int32_t matchPosition;
4650                 if (plusOrMinus.compare("+") == 0) {
4651                     matchPosition = testMat->end(groupNum, status);
4652                 } else {
4653                     matchPosition = testMat->start(groupNum, status);
4654                 }
4655                 if (matchPosition != -1) {
4656                     ICU_Utility::appendNumber(resultString, matchPosition);
4657                 }
4658                 perlExpr.remove(0, groupsMat->end(status));
4659             }
4660
4661             else if (cgMat->lookingAt(status)) {
4662                 // $1, $2, $3, etc.
4663                 UnicodeString digitString = cgMat->group(1, status);
4664                 int32_t t = 0;
4665                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4666                 if (U_SUCCESS(status)) {
4667                     resultString.append(testMat->group(groupNum, status));
4668                     status = U_ZERO_ERROR;
4669                 }
4670                 perlExpr.remove(0, cgMat->end(status));
4671             }
4672
4673             else if (perlExpr.startsWith("@-")) {
4674                 int32_t i;
4675                 for (i=0; i<=testMat->groupCount(); i++) {
4676                     if (i>0) {
4677                         resultString.append(" ");
4678                     }
4679                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4680                 }
4681                 perlExpr.remove(0, 2);
4682             }
4683
4684             else if (perlExpr.startsWith("@+")) {
4685                 int32_t i;
4686                 for (i=0; i<=testMat->groupCount(); i++) {
4687                     if (i>0) {
4688                         resultString.append(" ");
4689                     }
4690                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4691                 }
4692                 perlExpr.remove(0, 2);
4693             }
4694
4695             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4696                                                      //           or as an escaped sequence (e.g. \n)
4697                 if (perlExpr.length() > 1) {
4698                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4699                 }
4700                 UChar c = perlExpr.charAt(0);
4701                 switch (c) {
4702                 case 'n':   c = '\n'; break;
4703                 // add any other escape sequences that show up in the test expected results.
4704                 }
4705                 resultString.append(c);
4706                 perlExpr.remove(0, 1);
4707             }
4708
4709             else  {
4710                 // Any characters from the perl expression that we don't explicitly
4711                 //  recognize before here are assumed to be literals and copied
4712                 //  as-is to the expected results.
4713                 resultString.append(perlExpr.charAt(0));
4714                 perlExpr.remove(0, 1);
4715             }
4716
4717             if (U_FAILURE(status)) {
4718                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4719                 break;
4720             }
4721         }
4722
4723         //
4724         // Expected Results Compare
4725         //
4726         UnicodeString expectedS(fields[4]);
4727         expectedS.findAndReplace(nulnulSrc, nulnul);
4728         expectedS.findAndReplace(ffffSrc,   ffff);
4729         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4730
4731
4732         if (expectedS.compare(resultString) != 0) {
4733             err("Line %d: Incorrect perl expression results.", lineNum);
4734             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4735         }
4736
4737         delete testMat;
4738         delete testPat;
4739     }
4740
4741     //
4742     // All done.  Clean up allocated stuff.
4743     //
4744     delete cgMat;
4745     delete cgPat;
4746
4747     delete groupsMat;
4748     delete groupsPat;
4749
4750     delete flagMat;
4751     delete flagPat;
4752
4753     delete lineMat;
4754     delete linePat;
4755
4756     delete fieldPat;
4757     delete [] testData;
4758
4759     utext_close(&patternText);
4760     utext_close(&inputText);
4761
4762     delete [] patternChars;
4763     delete [] inputChars;
4764
4765
4766     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4767
4768 }
4769
4770
4771 //--------------------------------------------------------------
4772 //
4773 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4774 //             Use this pattern,
4775 //                 "(a?){1,8000000}"
4776 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4777 //                   This test is likely to be fragile, as further optimizations stop
4778 //                   more cases of pointless looping in the match engine.
4779 //
4780 //---------------------------------------------------------------
4781 void RegexTest::Bug6149() {
4782     UnicodeString pattern("(a?){1,8000000}");
4783     UnicodeString s("xyz");
4784     uint32_t flags = 0;
4785     UErrorCode status = U_ZERO_ERROR;
4786
4787     RegexMatcher  matcher(pattern, s, flags, status);
4788     UBool result = false;
4789     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4790     REGEX_ASSERT(result == FALSE);
4791  }
4792
4793
4794 //
4795 //   Callbacks()    Test the callback function.
4796 //                  When set, callbacks occur periodically during matching operations,
4797 //                  giving the application code the ability to abort the operation
4798 //                  before its normal completion.
4799 //
4800
4801 struct callBackContext {
4802     RegexTest        *test;
4803     int32_t          maxCalls;
4804     int32_t          numCalls;
4805     int32_t          lastSteps;
4806     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4807 };
4808
4809 U_CDECL_BEGIN
4810 static UBool U_CALLCONV
4811 testCallBackFn(const void *context, int32_t steps) {
4812     callBackContext  *info = (callBackContext *)context;
4813     if (info->lastSteps+1 != steps) {
4814         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4815     }
4816     info->lastSteps = steps;
4817     info->numCalls++;
4818     return (info->numCalls < info->maxCalls);
4819 }
4820 U_CDECL_END
4821
//
//   RegexTest::Callbacks
//
//       First block:  on a matcher that never had a callback set,
//                     getMatchCallback() must hand back NULL for both the
//                     function and the context.
//       Second block: after setMatchCallback(testCallBackFn, &cbInfo), the getter
//                     must return exactly those two values.  Four scenarios follow
//                     on the same matcher: a short match (callback never called),
//                     a medium match (callback called, match runs to completion),
//                     and a long matches() and a long find() that the callback stops
//                     (status U_REGEX_STOPPED_BY_CALLER after exactly 4 calls).
//
//       REGEX_CHECK_STATUS and REGEX_ASSERT are macros defined elsewhere in this file.
//
4822 void RegexTest::Callbacks() {
4823    {
4824         // Getter returns NULLs if no callback has been set
4825
4826         //   The variables that the getter will fill in.
4827         //   Init to non-null values so that the action of the getter can be seen.
4828         const void          *returnedContext = &returnedContext;
4829         URegexMatchCallback *returnedFn = &testCallBackFn;
4830
4831         UErrorCode status = U_ZERO_ERROR;
4832         RegexMatcher matcher("x", 0, status);
4833         REGEX_CHECK_STATUS;
4834         matcher.getMatchCallback(returnedFn, returnedContext, status);
4835         REGEX_CHECK_STATUS;
4836         REGEX_ASSERT(returnedFn == NULL);
4837         REGEX_ASSERT(returnedContext == NULL);
4838     }
4839
4840    {
4841         // Set and Get work
             //   cbInfo is the context object passed to testCallBackFn on every callback.
4842         callBackContext cbInfo = {this, 0, 0, 0};
4843         const void          *returnedContext;
4844         URegexMatchCallback *returnedFn;
4845         UErrorCode status = U_ZERO_ERROR;
4846         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4847         REGEX_CHECK_STATUS;
4848         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4849         REGEX_CHECK_STATUS;
4850         matcher.getMatchCallback(returnedFn, returnedContext, status);
4851         REGEX_CHECK_STATUS;
4852         REGEX_ASSERT(returnedFn == testCallBackFn);
4853         REGEX_ASSERT(returnedContext == &cbInfo);
4854
4855         // A short-running match shouldn't invoke the callback
             //   With a limit of 1, testCallBackFn would return FALSE on its very first call.
4856         status = U_ZERO_ERROR;
4857         cbInfo.reset(1);
4858         UnicodeString s = "xxx";
4859         matcher.reset(s);
4860         REGEX_ASSERT(matcher.matches(status));
4861         REGEX_CHECK_STATUS;
4862         REGEX_ASSERT(cbInfo.numCalls == 0);
4863
4864         // A medium-length match that runs long enough to invoke the
4865         //   callback, but not so long that the callback aborts it.
             //   With a limit of 4, testCallBackFn returns TRUE for its first three calls.
4866         status = U_ZERO_ERROR;
4867         cbInfo.reset(4);
4868         s = "aaaaaaaaaaaaaaaaaaab";
4869         matcher.reset(s);
4870         REGEX_ASSERT(matcher.matches(status)==FALSE);
4871         REGEX_CHECK_STATUS;
4872         REGEX_ASSERT(cbInfo.numCalls > 0);
4873
4874         // A longer running match that the callback function will abort.
             //   Same limit as above, longer input: the fourth callback returns FALSE.
4875         status = U_ZERO_ERROR;
4876         cbInfo.reset(4);
4877         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4878         matcher.reset(s);
4879         REGEX_ASSERT(matcher.matches(status)==FALSE);
4880         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4881         REGEX_ASSERT(cbInfo.numCalls == 4);
4882
4883         // A longer running find that the callback function will abort.
             //   Same input and limit as the previous case, but going through find().
4884         status = U_ZERO_ERROR;
4885         cbInfo.reset(4);
4886         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4887         matcher.reset(s);
4888         REGEX_ASSERT(matcher.find(status)==FALSE);
4889         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4890         REGEX_ASSERT(cbInfo.numCalls == 4);
4891     }
4892
4893
4894 }
4895
4896
4897 //
4898 //   FindProgressCallbacks()    Test the find "progress" callback function.
4899 //                  When set, the find progress callback will be invoked during find operations
4900 //                  after each return from a match attempt, giving the application the opportunity
4901 //                  to terminate a long-running find operation before its normal completion.
4902 //
4903
4904 struct progressCallBackContext {
4905     RegexTest        *test;
4906     int64_t          lastIndex;
4907     int32_t          maxCalls;
4908     int32_t          numCalls;
4909     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4910 };
4911
4912 // call-back function for find().
4913 // Return TRUE to continue the find().
4914 // Return FALSE to stop the find().
4915 U_CDECL_BEGIN
4916 static UBool U_CALLCONV
4917 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4918     progressCallBackContext  *info = (progressCallBackContext *)context;
4919     info->numCalls++;
4920     info->lastIndex = matchIndex;
4921 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4922     return (info->numCalls < info->maxCalls);
4923 }
4924 U_CDECL_END
4925
//
//   RegexTest::FindProgressCallbacks
//
//       First block:  on a matcher that never had a find-progress callback set,
//                     getFindProgressCallback() must hand back NULL for both the
//                     function and the context.
//       Second block: after setFindProgressCallback(testProgressCallBackFn, &cbInfo),
//                     the getter must return exactly those two values.  Four scenarios
//                     follow on the same matcher: a find that succeeds at the start
//                     position (callback never called), a failing find that stays
//                     under the callback limit, a find stopped by the callback, and a
//                     stopped find that is then resumed from the last reported index.
//
//       REGEX_CHECK_STATUS and REGEX_ASSERT are macros defined elsewhere in this file.
//
4926 void RegexTest::FindProgressCallbacks() {
4927    {
4928         // Getter returns NULLs if no callback has been set
4929
4930         //   The variables that the getter will fill in.
4931         //   Init to non-null values so that the action of the getter can be seen.
4932         const void                  *returnedContext = &returnedContext;
4933         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4934
4935         UErrorCode status = U_ZERO_ERROR;
4936         RegexMatcher matcher("x", 0, status);
4937         REGEX_CHECK_STATUS;
4938         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4939         REGEX_CHECK_STATUS;
4940         REGEX_ASSERT(returnedFn == NULL);
4941         REGEX_ASSERT(returnedContext == NULL);
4942     }
4943
4944    {
4945         // Set and Get work
             //   cbInfo is the context object passed to testProgressCallBackFn on every callback.
4946         progressCallBackContext cbInfo = {this, 0, 0, 0};
4947         const void                  *returnedContext;
4948         URegexFindProgressCallback  *returnedFn;
4949         UErrorCode status = U_ZERO_ERROR;
4950         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4951         REGEX_CHECK_STATUS;
4952         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4953         REGEX_CHECK_STATUS;
4954         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4955         REGEX_CHECK_STATUS;
4956         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4957         REGEX_ASSERT(returnedContext == &cbInfo);
4958
4959         // A find that matches on the initial position does NOT invoke the callback.
4960         status = U_ZERO_ERROR;
4961         cbInfo.reset(100);
4962         UnicodeString s = "aaxxx";
4963         matcher.reset(s);
             // Flip the #if below to turn on matcher tracing while debugging this case.
4964 #if 0
4965         matcher.setTrace(TRUE);
4966 #endif
4967         REGEX_ASSERT(matcher.find(0, status));
4968         REGEX_CHECK_STATUS;
4969         REGEX_ASSERT(cbInfo.numCalls == 0);
4970
4971         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4972         //   but not so many times that we interrupt the operation.
             //   The limit passed to reset() is s.length(); the checks below require the find to
             //   finish with a success status, i.e. without the callback having stopped it.
4973         status = U_ZERO_ERROR;
4974         s = "aaaaaaaaaaaaaaaaaaab";
4975         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4976         matcher.reset(s);
4977         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4978         REGEX_CHECK_STATUS;
4979         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4980
4981         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
             //   The callback returns FALSE on call number s1.length() - 5, and the call count
             //   is expected to be exactly that value afterwards.
4982         status = U_ZERO_ERROR;
4983         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4984         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4985         matcher.reset(s1);
4986         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4987         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4988         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4989
4990         // Now a match that will succeed, but after an interruption
4991         status = U_ZERO_ERROR;
4992         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4993         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4994         matcher.reset(s2);
4995         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4996         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4997         // Now retry the match from where left off
             //   Only maxCalls is raised here; numCalls keeps its value from the interrupted
             //   find, and lastIndex holds the index most recently reported to the callback.
4998         cbInfo.maxCalls = 100; //  No callback limit
4999         status = U_ZERO_ERROR;
5000         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5001         REGEX_CHECK_STATUS;
5002     }
5003
5004
5005 }
5006
5007
5008 //---------------------------------------------------------------------------
5009 //
5010 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5011 //                             UTexts. The pure-C implementation of UText
5012 //                             has no mutable backing stores, but we can
5013 //                             use UnicodeString here to test the functionality.
5014 //
5015 //---------------------------------------------------------------------------
5016 void RegexTest::PreAllocatedUTextCAPI () {
5017     UErrorCode           status = U_ZERO_ERROR;
5018     URegularExpression  *re;
5019     UText                patternText = UTEXT_INITIALIZER;
5020     UnicodeString        buffer;
5021     UText                bufferText = UTEXT_INITIALIZER;
5022
5023     utext_openUnicodeString(&bufferText, &buffer, &status);
5024
5025     /*
5026      *  getText() and getUText()
5027      */
5028     {
5029         UText  text1 = UTEXT_INITIALIZER;
5030         UText  text2 = UTEXT_INITIALIZER;
5031         UChar  text2Chars[20];
5032         UText  *resultText;
5033
5034         status = U_ZERO_ERROR;
5035         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5036         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5037         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5038         utext_openUChars(&text2, text2Chars, -1, &status);
5039
5040         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5041         re = uregex_openUText(&patternText, 0, NULL, &status);
5042
5043         /* First set a UText */
5044         uregex_setUText(re, &text1, &status);
5045         resultText = uregex_getUText(re, &bufferText, &status);
5046         REGEX_CHECK_STATUS;
5047         REGEX_ASSERT(resultText == &bufferText);
5048         utext_setNativeIndex(resultText, 0);
5049         utext_setNativeIndex(&text1, 0);
5050         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5051
5052         resultText = uregex_getUText(re, &bufferText, &status);
5053         REGEX_CHECK_STATUS;
5054         REGEX_ASSERT(resultText == &bufferText);
5055         utext_setNativeIndex(resultText, 0);
5056         utext_setNativeIndex(&text1, 0);
5057         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5058
5059         /* Then set a UChar * */
5060         uregex_setText(re, text2Chars, 7, &status);
5061         resultText = uregex_getUText(re, &bufferText, &status);
5062         REGEX_CHECK_STATUS;
5063         REGEX_ASSERT(resultText == &bufferText);
5064         utext_setNativeIndex(resultText, 0);
5065         utext_setNativeIndex(&text2, 0);
5066         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5067
5068         uregex_close(re);
5069         utext_close(&text1);
5070         utext_close(&text2);
5071     }
5072
5073     /*
5074      *  group()
5075      */
5076     {
5077         UChar    text1[80];
5078         UText   *actual;
5079         UBool    result;
5080         int64_t  length = 0;
5081
5082         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5083         //                  012345678901234567890123456789012345678901234567
5084         //                  0         1         2         3         4
5085
5086         status = U_ZERO_ERROR;
5087         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5088         REGEX_CHECK_STATUS;
5089
5090         uregex_setText(re, text1, -1, &status);
5091         result = uregex_find(re, 0, &status);
5092         REGEX_ASSERT(result==TRUE);
5093
5094         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5095         status = U_ZERO_ERROR;
5096         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5097         REGEX_CHECK_STATUS;
5098         REGEX_ASSERT(actual == &bufferText);
5099         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5100         REGEX_ASSERT(length == 16);
5101         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5102
5103         /*  Capture group #1.  Should succeed, matching " interior ". */
5104         status = U_ZERO_ERROR;
5105         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5106         REGEX_CHECK_STATUS;
5107         REGEX_ASSERT(actual == &bufferText);
5108         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5109         REGEX_ASSERT(length == 10);
5110         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5111
5112         /*  Capture group out of range.  Error. */
5113         status = U_ZERO_ERROR;
5114         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5115         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5116         REGEX_ASSERT(actual == &bufferText);
5117         uregex_close(re);
5118
5119     }
5120
5121     /*
5122      *  replaceFirst()
5123      */
5124     {
5125         UChar    text1[80];
5126         UChar    text2[80];
5127         UText    replText = UTEXT_INITIALIZER;
5128         UText   *result;
5129         status = U_ZERO_ERROR;
5130         utext_openUnicodeString(&bufferText, &buffer, &status);
5131
5132         status = U_ZERO_ERROR;
5133         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5134         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5135         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5136
5137         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5138         REGEX_CHECK_STATUS;
5139
5140         /*  Normal case, with match */
5141         uregex_setText(re, text1, -1, &status);
5142         REGEX_CHECK_STATUS;
5143         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5144         REGEX_CHECK_STATUS;
5145         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5146         REGEX_CHECK_STATUS;
5147         REGEX_ASSERT(result == &bufferText);
5148         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5149
5150         /* No match.  Text should copy to output with no changes.  */
5151         uregex_setText(re, text2, -1, &status);
5152         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5153         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5154         REGEX_CHECK_STATUS;
5155         REGEX_ASSERT(result == &bufferText);
5156         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5157
5158         /* Unicode escapes */
5159         uregex_setText(re, text1, -1, &status);
5160         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5161         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5162         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5163         REGEX_CHECK_STATUS;
5164         REGEX_ASSERT(result == &bufferText);
5165         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5166
5167         uregex_close(re);
5168         utext_close(&replText);
5169     }
5170
5171
5172     /*
5173      *  replaceAll()
5174      */
5175     {
5176         UChar    text1[80];
5177         UChar    text2[80];
5178         UText    replText = UTEXT_INITIALIZER;
5179         UText   *result;
5180
5181         status = U_ZERO_ERROR;
5182         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5183         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5184         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5185
5186         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5187         REGEX_CHECK_STATUS;
5188
5189         /*  Normal case, with match */
5190         uregex_setText(re, text1, -1, &status);
5191         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5192         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5193         REGEX_CHECK_STATUS;
5194         REGEX_ASSERT(result == &bufferText);
5195         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5196
5197         /* No match.  Text should copy to output with no changes.  */
5198         uregex_setText(re, text2, -1, &status);
5199         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5200         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5201         REGEX_CHECK_STATUS;
5202         REGEX_ASSERT(result == &bufferText);
5203         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5204
5205         uregex_close(re);
5206         utext_close(&replText);
5207     }
5208
5209
5210     /*
5211      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5212      *   so we don't need to test it here.
5213      */
5214
5215     utext_close(&bufferText);
5216     utext_close(&patternText);
5217 }
5218
5219
5220 //--------------------------------------------------------------
5221 //
5222 //  NamedCapture   Check basic named capture group functionality
5223 //
5224 //--------------------------------------------------------------
5225 void RegexTest::NamedCapture() {
5226     UErrorCode status = U_ZERO_ERROR;
5227     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5228             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5229     REGEX_CHECK_STATUS;
5230     int32_t group = pat->groupNumberFromName("five", -1, status);
5231     REGEX_CHECK_STATUS;
5232     REGEX_ASSERT(5 == group);
5233     group = pat->groupNumberFromName("three", -1, status);
5234     REGEX_CHECK_STATUS;
5235     REGEX_ASSERT(3 == group);
5236
5237     status = U_ZERO_ERROR;
5238     group = pat->groupNumberFromName(UnicodeString("six"), status);
5239     REGEX_CHECK_STATUS;
5240     REGEX_ASSERT(6 == group);
5241
5242     status = U_ZERO_ERROR;
5243     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5244     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5245
5246     status = U_ZERO_ERROR;
5247
5248     // After copying a pattern, named capture should still work in the copy.
5249     RegexPattern *copiedPat = new RegexPattern(*pat);
5250     REGEX_ASSERT(*copiedPat == *pat);
5251     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5252
5253     group = copiedPat->groupNumberFromName("five", -1, status);
5254     REGEX_CHECK_STATUS;
5255     REGEX_ASSERT(5 == group);
5256     group = copiedPat->groupNumberFromName("three", -1, status);
5257     REGEX_CHECK_STATUS;
5258     REGEX_ASSERT(3 == group);
5259     delete copiedPat;
5260
5261     // ReplaceAll with named capture group.
5262     status = U_ZERO_ERROR;
5263     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5264     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5265     REGEX_CHECK_STATUS;
5266     // m.pattern().dumpPattern();
5267     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5268     REGEX_CHECK_STATUS;
5269     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5270     delete m;
5271
5272     // ReplaceAll, allowed capture group numbers.
5273     text = UnicodeString("abcmxyz");
5274     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5275     REGEX_CHECK_STATUS;
5276
5277     status = U_ZERO_ERROR;
5278     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5279     REGEX_CHECK_STATUS;
5280     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5281
5282     status = U_ZERO_ERROR;
5283     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5284     REGEX_CHECK_STATUS;
5285     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5286
5287     status = U_ZERO_ERROR;
5288     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5289     REGEX_CHECK_STATUS;
5290     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5291
5292     status = U_ZERO_ERROR;
5293     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5294     REGEX_CHECK_STATUS;
5295     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5296
5297     status = U_ZERO_ERROR;
5298     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5299     REGEX_CHECK_STATUS;
5300     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5301
5302     status = U_ZERO_ERROR;
5303     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5304     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5305
5306     status = U_ZERO_ERROR;
5307     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5308     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5309     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5310
5311     status = U_ZERO_ERROR;
5312     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5313     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5314     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5315
5316     status = U_ZERO_ERROR;
5317     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5318     REGEX_CHECK_STATUS;
5319     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5320
5321     status = U_ZERO_ERROR;
5322     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5323     REGEX_CHECK_STATUS;
5324     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5325
5326     status = U_ZERO_ERROR;
5327     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5328     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5329
5330     status = U_ZERO_ERROR;
5331     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5332     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5333
5334     status = U_ZERO_ERROR;
5335     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5336     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5337
5338     status = U_ZERO_ERROR;
5339     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5340     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5341
5342     delete m;
5343
5344     // Repeat the above replaceAll() tests using the plain C API, which
5345     //  has a separate implementation internally.
5346     //  TODO: factor out the test data.
5347
5348     status = U_ZERO_ERROR;
5349     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5350     REGEX_CHECK_STATUS;
5351     text = UnicodeString("abcmxyz");
5352     uregex_setText(re, text.getBuffer(), text.length(), &status);
5353     REGEX_CHECK_STATUS;
5354
5355     UChar resultBuf[100];
5356     int32_t resultLength;
5357     UnicodeString repl;
5358
5359     status = U_ZERO_ERROR;
5360     repl = UnicodeString("<$0>");
5361     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5362     REGEX_CHECK_STATUS;
5363     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5364
5365     status = U_ZERO_ERROR;
5366     repl = UnicodeString("<$1>");
5367     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5368     REGEX_CHECK_STATUS;
5369     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5370
5371     status = U_ZERO_ERROR;
5372     repl = UnicodeString("<${one}>");
5373     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374     REGEX_CHECK_STATUS;
5375     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5376
5377     status = U_ZERO_ERROR;
5378     repl = UnicodeString("<$2>");
5379     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380     REGEX_CHECK_STATUS;
5381     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5382
5383     status = U_ZERO_ERROR;
5384     repl = UnicodeString("<$3>");
5385     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386     REGEX_CHECK_STATUS;
5387     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5388
5389     status = U_ZERO_ERROR;
5390     repl = UnicodeString("<$4>");
5391     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5392     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5393
5394     status = U_ZERO_ERROR;
5395     repl = UnicodeString("<$04>");
5396     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5397     REGEX_CHECK_STATUS;
5398     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5399
5400     status = U_ZERO_ERROR;
5401     repl = UnicodeString("<$000016>");
5402     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5403     REGEX_CHECK_STATUS;
5404     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5405
5406     status = U_ZERO_ERROR;
5407     repl = UnicodeString("<$3$2$1${one}>");
5408     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409     REGEX_CHECK_STATUS;
5410     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5411
5412     status = U_ZERO_ERROR;
5413     repl = UnicodeString("$3$2$1${one}");
5414     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5415     REGEX_CHECK_STATUS;
5416     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5417
5418     status = U_ZERO_ERROR;
5419     repl = UnicodeString("<${noSuchName}>");
5420     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5421     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5422
5423     status = U_ZERO_ERROR;
5424     repl = UnicodeString("<${invalid-name}>");
5425     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5426     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5427
5428     status = U_ZERO_ERROR;
5429     repl = UnicodeString("<${one");
5430     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5431     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5432
5433     status = U_ZERO_ERROR;
5434     repl = UnicodeString("$not a capture group");
5435     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5436     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5437
5438     uregex_close(re);
5439 }
5440
5441 //--------------------------------------------------------------
5442 //
5443 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5444 //                       The point is not so much what the exact limit is,
5445 //                       but that a largish number doesn't hit bad non-linear performance,
5446 //                       and that exceeding the limit fails cleanly.
5447 //
5448 //--------------------------------------------------------------
5449 void RegexTest::NamedCaptureLimits() {
5450     if (quick) {
5451         logln("Skipping test. Runs in exhuastive mode only.");
5452         return;
5453     }
5454     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5455     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5456     char nnbuf[100];
5457     UnicodeString pattern;
5458     int32_t nn;
5459
5460     for (nn=1; nn<goodLimit; nn++) {
5461         sprintf(nnbuf, "(?<nn%d>)", nn);
5462         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5463     }
5464     UErrorCode status = U_ZERO_ERROR;
5465     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5466     REGEX_CHECK_STATUS;
5467     for (nn=1; nn<goodLimit; nn++) {
5468         sprintf(nnbuf, "nn%d", nn);
5469         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5470         REGEX_ASSERT(nn == groupNum);
5471         if (nn != groupNum) {
5472             break;
5473         }
5474     }
5475     delete pat;
5476
5477     pattern.remove();
5478     for (nn=1; nn<failLimit; nn++) {
5479         sprintf(nnbuf, "(?<nn%d>)", nn);
5480         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5481     }
5482     status = U_ZERO_ERROR;
5483     pat = RegexPattern::compile(pattern, 0, status);
5484     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5485     delete pat;
5486 }
5487
5488
5489 //--------------------------------------------------------------
5490 //
5491 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5492 //
5493 //---------------------------------------------------------------
5494 void RegexTest::Bug7651() {
5495     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5496     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5497     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5498     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5499     UnicodeString s("#ff @abcd This is test");
5500     RegexPattern  *REPattern = NULL;
5501     RegexMatcher  *REMatcher = NULL;
5502     UErrorCode status = U_ZERO_ERROR;
5503     UParseError pe;
5504
5505     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5506     REGEX_CHECK_STATUS;
5507     REMatcher = REPattern->matcher(s, status);
5508     REGEX_CHECK_STATUS;
5509     REGEX_ASSERT(REMatcher->find());
5510     REGEX_ASSERT(REMatcher->start(status) == 0);
5511     delete REPattern;
5512     delete REMatcher;
5513     status = U_ZERO_ERROR;
5514
5515     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5516     REGEX_CHECK_STATUS;
5517     REMatcher = REPattern->matcher(s, status);
5518     REGEX_CHECK_STATUS;
5519     REGEX_ASSERT(REMatcher->find());
5520     REGEX_ASSERT(REMatcher->start(status) == 0);
5521     delete REPattern;
5522     delete REMatcher;
5523     status = U_ZERO_ERROR;
5524  }
5525
5526 void RegexTest::Bug7740() {
5527     UErrorCode status = U_ZERO_ERROR;
5528     UnicodeString pattern = "(a)";
5529     UnicodeString text = "abcdef";
5530     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5531     REGEX_CHECK_STATUS;
5532     REGEX_ASSERT(m->lookingAt(status));
5533     REGEX_CHECK_STATUS;
5534     status = U_ILLEGAL_ARGUMENT_ERROR;
5535     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5536     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5537     REGEX_ASSERT(s == "");
5538     delete m;
5539 }
5540
5541 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5542
5543 void RegexTest::Bug8479() {
5544     UErrorCode status = U_ZERO_ERROR;
5545
5546     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5547     REGEX_CHECK_STATUS;
5548     if (U_SUCCESS(status))
5549     {
5550         UnicodeString str;
5551         str.setToBogus();
5552         pMatcher->reset(str);
5553         status = U_ZERO_ERROR;
5554         pMatcher->matches(status);
5555         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5556         delete pMatcher;
5557     }
5558 }
5559
5560
5561 // Bug 7029
5562 void RegexTest::Bug7029() {
5563     UErrorCode status = U_ZERO_ERROR;
5564
5565     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5566     UnicodeString text = "abc.def";
5567     UnicodeString splits[10];
5568     REGEX_CHECK_STATUS;
5569     int32_t numFields = pMatcher->split(text, splits, 10, status);
5570     REGEX_CHECK_STATUS;
5571     REGEX_ASSERT(numFields == 8);
5572     delete pMatcher;
5573 }
5574
5575 // Bug 9283
5576 //   This test is checking for the existance of any supplemental characters that case-fold
5577 //   to a bmp character.
5578 //
5579 //   At the time of this writing there are none. If any should appear in a subsequent release
5580 //   of Unicode, the code in regular expressions compilation that determines the longest
5581 //   posssible match for a literal string  will need to be enhanced.
5582 //
5583 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5584 //   for details on what to do in case of a failure of this test.
5585 //
5586 void RegexTest::Bug9283() {
5587 #if !UCONFIG_NO_NORMALIZATION
5588     UErrorCode status = U_ZERO_ERROR;
5589     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5590     REGEX_CHECK_STATUS;
5591     int32_t index;
5592     UChar32 c;
5593     for (index=0; ; index++) {
5594         c = supplementalsWithCaseFolding.charAt(index);
5595         if (c == -1) {
5596             break;
5597         }
5598         UnicodeString cf = UnicodeString(c).foldCase();
5599         REGEX_ASSERT(cf.length() >= 2);
5600     }
5601 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5602 }
5603
5604
5605 void RegexTest::CheckInvBufSize() {
5606   if(inv_next>=INV_BUFSIZ) {
5607     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5608           __FILE__, INV_BUFSIZ, inv_next);
5609   } else {
5610     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5611   }
5612 }
5613
5614
5615 void RegexTest::Bug10459() {
5616     UErrorCode status = U_ZERO_ERROR;
5617     UnicodeString patternString("(txt)");
5618     UnicodeString txtString("txt");
5619
5620     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5621     REGEX_CHECK_STATUS;
5622     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5623     REGEX_CHECK_STATUS;
5624
5625     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5626     REGEX_CHECK_STATUS;
5627
5628     uregex_setUText(icu_re, utext_txt, &status);
5629     REGEX_CHECK_STATUS;
5630
5631     // The bug was that calling uregex_group() before doing a matching operation
5632     //   was causing a segfault. Only for Regular Expressions created from UText.
5633     //   It should set an U_REGEX_INVALID_STATE.
5634
5635     UChar buf[100];
5636     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5637     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5638     REGEX_ASSERT(len == 0);
5639
5640     uregex_close(icu_re);
5641     utext_close(utext_pat);
5642     utext_close(utext_txt);
5643 }
5644
5645 void RegexTest::TestCaseInsensitiveStarters() {
5646     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5647     //  become stale because of new Unicode characters.
5648     // If it is stale, rerun the generation tool
5649     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5650     // and replace the embedded data in i18n/regexcmp.cpp
5651
5652     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5653         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5654             continue;
5655         }
5656         UnicodeSet s(cp, cp);
5657         s.closeOver(USET_CASE_INSENSITIVE);
5658         UnicodeSetIterator setIter(s);
5659         while (setIter.next()) {
5660             if (!setIter.isString()) {
5661                 continue;
5662             }
5663             const UnicodeString &str = setIter.getString();
5664             UChar32 firstChar = str.char32At(0);
5665             UnicodeSet starters;
5666             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5667             if (!starters.contains(cp)) {
5668                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5669                 return;
5670             }
5671         }
5672     }
5673 }
5674
5675
5676 void RegexTest::TestBug11049() {
5677     // Original bug report: pattern with match start consisting of one of several individual characters,
5678     //  and the text being matched ending with a supplementary character. find() would read past the
5679     //  end of the input text when searching for potential match starting points.
5680
5681     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5682     // detect the bad read.
5683
5684     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5685     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5686
5687     // Test again with a pattern starting with a single character,
5688     // which takes a different code path than starting with an OR expression,
5689     // but with similar logic.
5690     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5691     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5692 }
5693
5694 // Run a single test case from TestBug11049(). Internal function.
5695 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5696     UErrorCode status = U_ZERO_ERROR;
5697     UnicodeString patternString = UnicodeString(pattern).unescape();
5698     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5699
5700     UnicodeString dataString = UnicodeString(data).unescape();
5701     UChar *exactBuffer = new UChar[dataString.length()];
5702     dataString.extract(exactBuffer, dataString.length(), status);
5703     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5704
5705     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5706     REGEX_CHECK_STATUS;
5707     matcher->reset(ut);
5708     UBool result = matcher->find();
5709     if (result != expectMatch) {
5710         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5711               __FILE__, lineNumber, expectMatch, result, pattern, data);
5712     }
5713
5714     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5715     //   off-by-one on find() with match at the last code point.
5716     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5717     //   because string.unescape() will only shrink it.
5718     char * utf8Buffer = new char[uprv_strlen(data)+1];
5719     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5720     REGEX_CHECK_STATUS;
5721     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5722     REGEX_CHECK_STATUS;
5723     matcher->reset(ut);
5724     result = matcher->find();
5725     if (result != expectMatch) {
5726         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5727               __FILE__, lineNumber, expectMatch, result, pattern, data);
5728     }
5729     delete [] utf8Buffer;
5730
5731     utext_close(ut);
5732     delete [] exactBuffer;
5733 }
5734
5735
5736 void RegexTest::TestBug11371() {
5737     if (quick) {
5738         logln("Skipping test. Runs in exhuastive mode only.");
5739         return;
5740     }
5741     UErrorCode status = U_ZERO_ERROR;
5742     UnicodeString patternString;
5743
5744     for (int i=0; i<8000000; i++) {
5745         patternString.append(UnicodeString("()"));
5746     }
5747     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5748     if (status != U_REGEX_PATTERN_TOO_BIG) {
5749         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5750               __FILE__, __LINE__, u_errorName(status));
5751     }
5752
5753     status = U_ZERO_ERROR;
5754     patternString = "(";
5755     for (int i=0; i<20000000; i++) {
5756         patternString.append(UnicodeString("A++"));
5757     }
5758     patternString.append(UnicodeString("){0}B++"));
5759     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5760     if (status != U_REGEX_PATTERN_TOO_BIG) {
5761         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762               __FILE__, __LINE__, u_errorName(status));
5763     }
5764
5765     // Pattern with too much string data, such that string indexes overflow operand data field size
5766     // in compiled instruction.
5767     status = U_ZERO_ERROR;
5768     patternString = "";
5769     while (patternString.length() < 0x00ffffff) {
5770         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5771     }
5772     patternString.append(UnicodeString("X? trailing string"));
5773     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5774     if (status != U_REGEX_PATTERN_TOO_BIG) {
5775         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5776               __FILE__, __LINE__, u_errorName(status));
5777     }
5778 }
5779
5780 void RegexTest::TestBug11480() {
5781     // C API, get capture group of a group that does not participate in the match.
5782     //        (Returns a zero length string, with nul termination,
5783     //         indistinguishable from a group with a zero lenght match.)
5784
5785     UErrorCode status = U_ZERO_ERROR;
5786     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5787     REGEX_CHECK_STATUS;
5788     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5789     uregex_setText(re, text.getBuffer(), text.length(), &status);
5790     REGEX_CHECK_STATUS;
5791     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5792     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5793     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5794     REGEX_ASSERT(length == 0);
5795     REGEX_ASSERT(buf[0] == 13);
5796     REGEX_ASSERT(buf[1] == 0);
5797     REGEX_ASSERT(buf[2] == 13);
5798     uregex_close(re);
5799 }
5800
5801
5802 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */