icuSources/test/intltest/regextst.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 2002-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8
   9 //
  10 //   regextst.cpp
  11 //
  12 //      ICU Regular Expressions test, part of intltest.
  13 //
  14
  15 /*
  16      NOTE!!
  17
  18      PLEASE be careful about ASCII assumptions in this test.
  19      This test is one of the worst repeat offenders.
  20      If you have questions, contact someone on the ICU PMC
  21      who has access to an EBCDIC system.
  22
  23  */
  24
  25 #include "intltest.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31
  32 #include "unicode/localpointer.h"
  33 #include "unicode/regex.h"
  34 #include "unicode/uchar.h"
  35 #include "unicode/ucnv.h"
  36 #include "unicode/uniset.h"
  37 #include "unicode/uregex.h"
  38 #include "unicode/usetiter.h"
  39 #include "unicode/ustring.h"
  40 #include "unicode/utext.h"
  41 #include "unicode/utf16.h"
  42 #include "cstr.h"
  43 #include "regextst.h"
  44 #include "regexcmp.h"
  45 #include "uvector.h"
  46 #include "util.h"
  47 #include "cmemory.h"
  48 #include "cstring.h"
  49 #include "uinvchar.h"
  50
  51 #define SUPPORT_MUTATING_INPUT_STRING   0
  52
  53 //---------------------------------------------------------------------------
  54 //
  55 //  Test class boilerplate
  56 //
  57 //---------------------------------------------------------------------------
  58 RegexTest::RegexTest()
  59 {
  60 }
  61
  62
  63 RegexTest::~RegexTest()
  64 {
  65 }
  66
  67
  68
// Test-framework entry point for this suite.
//
// The TESTCASE_AUTO* macros are defined outside this file; presumably each
// TESTCASE_AUTO(X) binds the next consecutive index to member function X,
// stores its name in `name`, and runs it when `exec` is true -- confirm
// against intltest.h.  Because of that, the order of the entries below is
// significant and new tests should be appended rather than inserted.
//
//   index  selects the test case the framework is asking about.
//   exec   when true, the suite banner is logged before dispatching.
//   name   reference handed to the macros (see above).
//   (4th)  unused parameter.
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) logln("TestSuite RegexTest: ");
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(Basic);
    TESTCASE_AUTO(API_Match);
    TESTCASE_AUTO(API_Replace);
    TESTCASE_AUTO(API_Pattern);
    // Extended is compiled out when file I/O is unavailable.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(Extended);
#endif
    TESTCASE_AUTO(Errors);
    TESTCASE_AUTO(PerlTests);
    TESTCASE_AUTO(Callbacks);
    TESTCASE_AUTO(FindProgressCallbacks);
    TESTCASE_AUTO(Bug6149);
    TESTCASE_AUTO(UTextBasic);
    TESTCASE_AUTO(API_Match_UTF8);
    TESTCASE_AUTO(API_Replace_UTF8);
    TESTCASE_AUTO(API_Pattern_UTF8);
    TESTCASE_AUTO(PerlTestsUTF8);
    TESTCASE_AUTO(PreAllocatedUTextCAPI);
    TESTCASE_AUTO(Bug7651);
    TESTCASE_AUTO(Bug7740);
    TESTCASE_AUTO(Bug8479);
    TESTCASE_AUTO(Bug7029);
    TESTCASE_AUTO(CheckInvBufSize);
    TESTCASE_AUTO(Bug9283);
    TESTCASE_AUTO(Bug10459);
    TESTCASE_AUTO(TestCaseInsensitiveStarters);
    TESTCASE_AUTO(TestBug11049);
    TESTCASE_AUTO(TestBug11371);
    TESTCASE_AUTO(TestBug11480);
    TESTCASE_AUTO(NamedCapture);
    TESTCASE_AUTO(NamedCaptureLimits);
    TESTCASE_AUTO(TestBug12884);
    TESTCASE_AUTO(TestBug13631);
    TESTCASE_AUTO(TestBug13632);
    TESTCASE_AUTO(TestBug20359);
    TESTCASE_AUTO_END;
}
 110
 111
/**
 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 * into ASCII.
 *
 * Forward declaration only; the definition appears later in this file, next to INV_BUFSIZ.
 *
 * @param ut      UText to open; passed through to utext_openUTF8.
 * @param inv     invariant-character text in the compilation codepage.
 * @param length  length of inv in chars, or -1 to have it measured with strlen().
 * @param status  ICU error code; passed through to utext_openUTF8.
 * @return the result of utext_openUTF8; on non-ASCII builds NULL is returned
 *         (with U_MEMORY_ALLOCATION_ERROR) when the conversion buffer is exhausted.
 * @see utext_openUTF8
 */
static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 118
 119 //---------------------------------------------------------------------------
 120 //
 121 //   Error Checking / Reporting macros used in all of the tests.
 122 //
 123 //---------------------------------------------------------------------------
 124
 125 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 126   int64_t oldIndex = utext_getNativeIndex(text);
 127   utext_setNativeIndex(text, 0);
 128   char *bufPtr = buf;
 129   UChar32 c = utext_next32From(text, 0);
 130   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 131     if (0x000020<=c && c<0x00007e) {
 132       *bufPtr = c;
 133     } else {
 134 #if 0
 135       sprintf(bufPtr,"U+%04X", c);
 136       bufPtr+= strlen(bufPtr)-1;
 137 #else
 138       *bufPtr = '%';
 139 #endif
 140     }
 141     bufPtr++;
 142     c = UTEXT_NEXT32(text);
 143   }
 144   *bufPtr = 0;
 145 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 146   char *ebuf = (char*)malloc(bufLen);
 147   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 148   uprv_strncpy(buf, ebuf, bufLen);
 149   free((void*)ebuf);
 150 #endif
 151   utext_setNativeIndex(text, oldIndex);
 152 }
 153
 154
 155 static char ASSERT_BUF[1024];
 156
 157 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 158   if(message.length()==0) {
 159     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 160   } else {
 161     UnicodeString buf;
 162     IntlTest::prettify(message,buf);
 163     if(buf.length()==0) {
 164       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 165     } else {
 166       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 167       if(ASSERT_BUF[0]==0) {
 168         ASSERT_BUF[0]=0;
 169         for(int32_t i=0;i<buf.length();i++) {
 170           UChar ch = buf[i];
 171           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 172         }
 173       }
 174     }
 175   }
 176   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 177   return ASSERT_BUF;
 178 }
 179
// Logs (verbose output only, via logln) a printable rendering of the UText
// `text`, tagged with the invoking file/line and the spelled-out argument.
// The rendering goes through a 200-char local scratch buffer.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// Requires a variable named `status` in scope.  On failure it reports through
// dataerrln and executes a bare `return;`, so it leaves the enclosing function
// immediately and can only be used in functions returning void.
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure when `expr` compares equal to FALSE; execution continues
// afterwards (no return).
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Declares its own local `status` (shadowing any outer one), evaluates `expr`
// -- which is expected to use that `status` -- and reports when the resulting
// status differs from `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Like REGEX_CHECK_STATUS but also reports a caller-supplied originating
// `line`, and does NOT return from the enclosing function.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Like REGEX_ASSERT but reports a caller-supplied originating `line` and
// executes a bare `return;` on failure (void functions only).
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
// Reports a failure (without returning) when the two differ.  `expected` is
// evaluated twice and `actual` is evaluated again when building the message.
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
 203
 204
 205 static UBool testUTextEqual(UText *uta, UText *utb) {
 206     UChar32 ca = 0;
 207     UChar32 cb = 0;
 208     utext_setNativeIndex(uta, 0);
 209     utext_setNativeIndex(utb, 0);
 210     do {
 211         ca = utext_next32(uta);
 212         cb = utext_next32(utb);
 213         if (ca != cb) {
 214             break;
 215         }
 216     } while (ca != U_SENTINEL);
 217     return ca == cb;
 218 }
 219
 220
 221 /**
 222  * @param expected expected text in UTF-8 (not platform) codepage
 223  */
 224 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 225     UErrorCode status = U_ZERO_ERROR;
 226     UText expectedText = UTEXT_INITIALIZER;
 227     utext_openUTF8(&expectedText, expected, -1, &status);
 228     if(U_FAILURE(status)) {
 229       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 230       return;
 231     }
 232     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 233       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 234       return;
 235     }
 236     utext_setNativeIndex(actual, 0);
 237     if (!testUTextEqual(&expectedText, actual)) {
 238         char buf[201 /*21*/];
 239         char expectedBuf[201];
 240         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 241         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 242         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 243     }
 244     utext_close(&expectedText);
 245 }
 246 /**
 247  * @param expected invariant (platform local text) input
 248  */
 249
 250 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 251     UErrorCode status = U_ZERO_ERROR;
 252     UText expectedText = UTEXT_INITIALIZER;
 253     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 254     if(U_FAILURE(status)) {
 255       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 256       return;
 257     }
 258     utext_setNativeIndex(actual, 0);
 259     if (!testUTextEqual(&expectedText, actual)) {
 260         char buf[201 /*21*/];
 261         char expectedBuf[201];
 262         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 263         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 264         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 265     }
 266     utext_close(&expectedText);
 267 }
 268
/**
 * Assumes utf-8 input.
 * Forwards to assertUText() with the invoking file and line, so failures are
 * attributed to the line of the macro invocation.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Assumes Invariant input.
 * Forwards to assertUTextInvariant() with the invoking file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 277
 278 /**
 279  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 280  * passed into utext_openUTF8. An error will be given if
 281  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 282  */
 283
 284 #define INV_BUFSIZ 2048 /* increase this if too small */
 285
 286 static int64_t inv_next=0;
 287
 288 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
 289 static char inv_buf[INV_BUFSIZ];
 290 #endif
 291
 292 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 293   if(length==-1) length=strlen(inv);
 294 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 295   inv_next+=length;
 296   return utext_openUTF8(ut, inv, length, status);
 297 #else
 298   if(inv_next+length+1>INV_BUFSIZ) {
 299     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 300             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 301     *status = U_MEMORY_ALLOCATION_ERROR;
 302     return NULL;
 303   }
 304
 305   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 306   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 307   inv_next+=length;
 308
 309 #if 0
 310   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 311 #endif
 312
 313   return utext_openUTF8(ut, (const char*)buf, length, status);
 314 #endif
 315 }
 316
 317
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
//                       for the LookingAt() and  Match() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is unescaped.  The pattern is not.
//
//          Each use runs the check twice: once through doRegexLMTest()
//          (UnicodeString input) and once through doRegexLMTestUTF8()
//          (UTF-8 UText input).  The line of the invocation is passed along
//          and appears in any failure message.  Both strings are converted
//          with US_INV, so they must contain invariant characters only.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 333
 334 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 335     const UnicodeString pattern(pat, -1, US_INV);
 336     const UnicodeString inputText(text, -1, US_INV);
 337     UErrorCode          status  = U_ZERO_ERROR;
 338     UParseError         pe;
 339     RegexPattern        *REPattern = NULL;
 340     RegexMatcher        *REMatcher = NULL;
 341     UBool               retVal     = TRUE;
 342
 343     UnicodeString patString(pat, -1, US_INV);
 344     REPattern = RegexPattern::compile(patString, 0, pe, status);
 345     if (U_FAILURE(status)) {
 346         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 347             line, u_errorName(status));
 348         return FALSE;
 349     }
 350     if (line==376) { REPattern->dumpPattern();}
 351
 352     UnicodeString inputString(inputText);
 353     UnicodeString unEscapedInput = inputString.unescape();
 354     REMatcher = REPattern->matcher(unEscapedInput, status);
 355     if (U_FAILURE(status)) {
 356         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 357             line, u_errorName(status));
 358         return FALSE;
 359     }
 360
 361     UBool actualmatch;
 362     actualmatch = REMatcher->lookingAt(status);
 363     if (U_FAILURE(status)) {
 364         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 365             line, u_errorName(status));
 366         retVal =  FALSE;
 367     }
 368     if (actualmatch != looking) {
 369         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 370         retVal = FALSE;
 371     }
 372
 373     status = U_ZERO_ERROR;
 374     actualmatch = REMatcher->matches(status);
 375     if (U_FAILURE(status)) {
 376         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 377             line, u_errorName(status));
 378         retVal = FALSE;
 379     }
 380     if (actualmatch != match) {
 381         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 382         retVal = FALSE;
 383     }
 384
 385     if (retVal == FALSE) {
 386         REPattern->dumpPattern();
 387     }
 388
 389     delete REPattern;
 390     delete REMatcher;
 391     return retVal;
 392 }
 393
 394
 395 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 396     UText               pattern    = UTEXT_INITIALIZER;
 397     int32_t             inputUTF8Length;
 398     char                *textChars = NULL;
 399     UText               inputText  = UTEXT_INITIALIZER;
 400     UErrorCode          status     = U_ZERO_ERROR;
 401     UParseError         pe;
 402     RegexPattern        *REPattern = NULL;
 403     RegexMatcher        *REMatcher = NULL;
 404     UBool               retVal     = TRUE;
 405
 406     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 407     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 408     if (U_FAILURE(status)) {
 409         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 410             line, u_errorName(status));
 411         return FALSE;
 412     }
 413
 414     UnicodeString inputString(text, -1, US_INV);
 415     UnicodeString unEscapedInput = inputString.unescape();
 416     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 417     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 418
 419     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 420     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 421         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 422         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 423         return TRUE; // not a failure of the Regex engine
 424     }
 425     status = U_ZERO_ERROR; // buffer overflow
 426     textChars = new char[inputUTF8Length+1];
 427     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 428     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 429
 430     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 431     if (U_FAILURE(status)) {
 432         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 433             line, u_errorName(status));
 434         return FALSE;
 435     }
 436
 437     UBool actualmatch;
 438     actualmatch = REMatcher->lookingAt(status);
 439     if (U_FAILURE(status)) {
 440         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 441             line, u_errorName(status));
 442         retVal =  FALSE;
 443     }
 444     if (actualmatch != looking) {
 445         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 446         retVal = FALSE;
 447     }
 448
 449     status = U_ZERO_ERROR;
 450     actualmatch = REMatcher->matches(status);
 451     if (U_FAILURE(status)) {
 452         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 453             line, u_errorName(status));
 454         retVal = FALSE;
 455     }
 456     if (actualmatch != match) {
 457         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 458         retVal = FALSE;
 459     }
 460
 461     if (retVal == FALSE) {
 462         REPattern->dumpPattern();
 463     }
 464
 465     delete REPattern;
 466     delete REMatcher;
 467     utext_close(&inputText);
 468     utext_close(&pattern);
 469     delete[] textChars;
 470     return retVal;
 471 }
 472
 473
 474
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing tests
//                       regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       The macro forwards to regex_err(), adding the line of the invocation
//       for use in failure messages.  Note that the expansion already ends
//       with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 485
 486 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 487                           UErrorCode expectedStatus, int32_t line) {
 488     UnicodeString       pattern(pat);
 489
 490     UErrorCode          status         = U_ZERO_ERROR;
 491     UParseError         pe;
 492     RegexPattern        *callerPattern = NULL;
 493
 494     //
 495     //  Compile the caller's pattern
 496     //
 497     UnicodeString patString(pat);
 498     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 499     if (status != expectedStatus) {
 500         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 501     } else {
 502         if (status != U_ZERO_ERROR) {
 503             if (pe.line != errLine || pe.offset != errCol) {
 504                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 505                     line, errLine, errCol, pe.line, pe.offset);
 506             }
 507         }
 508     }
 509
 510     delete callerPattern;
 511
 512     //
 513     //  Compile again, using a UTF-8-based UText
 514     //
 515     UText patternText = UTEXT_INITIALIZER;
 516     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 517     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 518     if (status != expectedStatus) {
 519         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 520     } else {
 521         if (status != U_ZERO_ERROR) {
 522             if (pe.line != errLine || pe.offset != errCol) {
 523                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 524                     line, errLine, errCol, pe.line, pe.offset);
 525             }
 526         }
 527     }
 528
 529     delete callerPattern;
 530     utext_close(&patternText);
 531 }
 532
 533
 534
 535 //---------------------------------------------------------------------------
 536 //
 537 //      Basic      Check for basic functionality of regex pattern matching.
 538 //                 Avoid the use of REGEX_FIND test macro, which has
 539 //                 substantial dependencies on basic Regex functionality.
 540 //
 541 //---------------------------------------------------------------------------
void RegexTest::Basic() {
    // Every REGEX_TESTLM(pattern, input, lookingAtExpected, matchesExpected)
    // line below is one self-contained check.  The macro passes its own
    // __LINE__ to the workers, so failures are reported against the line of
    // the individual invocation.


//
// Debug - slide failing test cases early
//
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
 688
 689
 690 //---------------------------------------------------------------------------
 691 //
 692 //    UTextBasic   Check for quirks that are specific to the UText
 693 //                 implementation.
 694 //
 695 //---------------------------------------------------------------------------
 696 void RegexTest::UTextBasic() {
 697     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 698     UErrorCode status = U_ZERO_ERROR;
 699     UText pattern = UTEXT_INITIALIZER;
 700     utext_openUTF8(&pattern, str_abc, -1, &status);
 701     RegexMatcher matcher(&pattern, 0, status);
 702     REGEX_CHECK_STATUS;
 703
 704     UText input = UTEXT_INITIALIZER;
 705     utext_openUTF8(&input, str_abc, -1, &status);
 706     REGEX_CHECK_STATUS;
 707     matcher.reset(&input);
 708     REGEX_CHECK_STATUS;
 709     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 710
 711     matcher.reset(matcher.inputText());
 712     REGEX_CHECK_STATUS;
 713     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 714
 715     utext_close(&pattern);
 716     utext_close(&input);
 717 }
 718
 719
 720 //---------------------------------------------------------------------------
 721 //
 722 //      API_Match   Test that the API for class RegexMatcher
 723 //                  is present and nominally working, but excluding functions
 724 //                  implementing replace operations.
 725 //
 726 //---------------------------------------------------------------------------
 727 void RegexTest::API_Match() {
 728     UParseError         pe;
 729     UErrorCode          status=U_ZERO_ERROR;
 730     int32_t             flags = 0;
 731
 732     //
 733     // Debug - slide failing test cases early
 734     //
 735 #if 0
 736     {
 737     }
 738     return;
 739 #endif
 740
 741     //
 742     // Simple pattern compilation
 743     //
 744     {
 745         UnicodeString       re("abc");
 746         RegexPattern        *pat2;
 747         pat2 = RegexPattern::compile(re, flags, pe, status);
 748         REGEX_CHECK_STATUS;
 749
 750         UnicodeString inStr1 = "abcdef this is a test";
 751         UnicodeString instr2 = "not abc";
 752         UnicodeString empty  = "";
 753
 754
 755         //
 756         // Matcher creation and reset.
 757         //
 758         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 759         REGEX_CHECK_STATUS;
 760         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 761         REGEX_ASSERT(m1->input() == inStr1);
 762         m1->reset(instr2);
 763         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 764         REGEX_ASSERT(m1->input() == instr2);
 765         m1->reset(inStr1);
 766         REGEX_ASSERT(m1->input() == inStr1);
 767         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 768         m1->reset(empty);
 769         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 770         REGEX_ASSERT(m1->input() == empty);
 771         REGEX_ASSERT(&m1->pattern() == pat2);
 772
 773         //
 774         //  reset(pos, status)
 775         //
 776         m1->reset(inStr1);
 777         m1->reset(4, status);
 778         REGEX_CHECK_STATUS;
 779         REGEX_ASSERT(m1->input() == inStr1);
 780         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 781
 782         m1->reset(-1, status);
 783         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 784         status = U_ZERO_ERROR;
 785
 786         m1->reset(0, status);
 787         REGEX_CHECK_STATUS;
 788         status = U_ZERO_ERROR;
 789
 790         int32_t len = m1->input().length();
 791         m1->reset(len-1, status);
 792         REGEX_CHECK_STATUS;
 793         status = U_ZERO_ERROR;
 794
 795         m1->reset(len, status);
 796         REGEX_CHECK_STATUS;
 797         status = U_ZERO_ERROR;
 798
 799         m1->reset(len+1, status);
 800         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 801         status = U_ZERO_ERROR;
 802
 803         //
 804         // match(pos, status)
 805         //
 806         m1->reset(instr2);
 807         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 808         m1->reset();
 809         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 810         m1->reset();
 811         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 812         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 813         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 814         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 815
 816         // Match() at end of string should fail, but should not
 817         //  be an error.
 818         status = U_ZERO_ERROR;
 819         len = m1->input().length();
 820         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 821         REGEX_CHECK_STATUS;
 822
 823         // Match beyond end of string should fail with an error.
 824         status = U_ZERO_ERROR;
 825         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 826         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 827
 828         // Successful match at end of string.
 829         {
 830             status = U_ZERO_ERROR;
 831             RegexMatcher m("A?", 0, status);  // will match zero length string.
 832             REGEX_CHECK_STATUS;
 833             m.reset(inStr1);
 834             len = inStr1.length();
 835             REGEX_ASSERT(m.matches(len, status) == TRUE);
 836             REGEX_CHECK_STATUS;
 837             m.reset(empty);
 838             REGEX_ASSERT(m.matches(0, status) == TRUE);
 839             REGEX_CHECK_STATUS;
 840         }
 841
 842
 843         //
 844         // lookingAt(pos, status)
 845         //
 846         status = U_ZERO_ERROR;
 847         m1->reset(instr2);  // "not abc"
 848         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 849         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 850         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 851         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 852         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 853         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 854         status = U_ZERO_ERROR;
 855         len = m1->input().length();
 856         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 857         REGEX_CHECK_STATUS;
 858         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 859         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 860
 861         delete m1;
 862         delete pat2;
 863     }
 864
 865
 866     //
 867     // Capture Group.
 868     //     RegexMatcher::start();
 869     //     RegexMatcher::end();
 870     //     RegexMatcher::groupCount();
 871     //
 872     {
 873         int32_t             flags=0;
 874         UParseError         pe;
 875         UErrorCode          status=U_ZERO_ERROR;
 876
 877         UnicodeString       re("01(23(45)67)(.*)");
 878         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 879         REGEX_CHECK_STATUS;
 880         UnicodeString data = "0123456789";
 881
 882         RegexMatcher *matcher = pat->matcher(data, status);
 883         REGEX_CHECK_STATUS;
 884         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 885         static const int32_t matchStarts[] = {0,  2, 4, 8};
 886         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 887         int32_t i;
 888         for (i=0; i<4; i++) {
 889             int32_t actualStart = matcher->start(i, status);
 890             REGEX_CHECK_STATUS;
 891             if (actualStart != matchStarts[i]) {
 892                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 893                     __LINE__, i, matchStarts[i], actualStart);
 894             }
 895             int32_t actualEnd = matcher->end(i, status);
 896             REGEX_CHECK_STATUS;
 897             if (actualEnd != matchEnds[i]) {
 898                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 899                     __LINE__, i, matchEnds[i], actualEnd);
 900             }
 901         }
 902
 903         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 904         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 905
 906         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 907         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 908         matcher->reset();
 909         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 910
 911         matcher->lookingAt(status);
 912         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 913         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 914         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 915         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 916         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 917         REGEX_CHECK_STATUS;
 918         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 919         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 920         matcher->reset();
 921         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 922
 923         delete matcher;
 924         delete pat;
 925
 926     }
 927
 928     //
 929     //  find
 930     //
 931     {
 932         int32_t             flags=0;
 933         UParseError         pe;
 934         UErrorCode          status=U_ZERO_ERROR;
 935
 936         UnicodeString       re("abc");
 937         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 938         REGEX_CHECK_STATUS;
 939         UnicodeString data = ".abc..abc...abc..";
 940         //                    012345678901234567
 941
 942         RegexMatcher *matcher = pat->matcher(data, status);
 943         REGEX_CHECK_STATUS;
 944         REGEX_ASSERT(matcher->find());
 945         REGEX_ASSERT(matcher->start(status) == 1);
 946         REGEX_ASSERT(matcher->find());
 947         REGEX_ASSERT(matcher->start(status) == 6);
 948         REGEX_ASSERT(matcher->find());
 949         REGEX_ASSERT(matcher->start(status) == 12);
 950         REGEX_ASSERT(matcher->find() == FALSE);
 951         REGEX_ASSERT(matcher->find() == FALSE);
 952
 953         matcher->reset();
 954         REGEX_ASSERT(matcher->find());
 955         REGEX_ASSERT(matcher->start(status) == 1);
 956
 957         REGEX_ASSERT(matcher->find(0, status));
 958         REGEX_ASSERT(matcher->start(status) == 1);
 959         REGEX_ASSERT(matcher->find(1, status));
 960         REGEX_ASSERT(matcher->start(status) == 1);
 961         REGEX_ASSERT(matcher->find(2, status));
 962         REGEX_ASSERT(matcher->start(status) == 6);
 963         REGEX_ASSERT(matcher->find(12, status));
 964         REGEX_ASSERT(matcher->start(status) == 12);
 965         REGEX_ASSERT(matcher->find(13, status) == FALSE);
 966         REGEX_ASSERT(matcher->find(16, status) == FALSE);
 967         REGEX_ASSERT(matcher->find(17, status) == FALSE);
 968         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
 969
 970         status = U_ZERO_ERROR;
 971         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 972         status = U_ZERO_ERROR;
 973         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
 974
 975         REGEX_ASSERT(matcher->groupCount() == 0);
 976
 977         delete matcher;
 978         delete pat;
 979     }
 980
 981
 982     //
 983     //  find, with \G in pattern (true if at the end of a previous match).
 984     //
 985     {
 986         int32_t             flags=0;
 987         UParseError         pe;
 988         UErrorCode          status=U_ZERO_ERROR;
 989
 990         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
 991         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 992         REGEX_CHECK_STATUS;
 993         UnicodeString data = ".abcabc.abc..";
 994         //                    012345678901234567
 995
 996         RegexMatcher *matcher = pat->matcher(data, status);
 997         REGEX_CHECK_STATUS;
 998         REGEX_ASSERT(matcher->find());
 999         REGEX_ASSERT(matcher->start(status) == 0);
1000         REGEX_ASSERT(matcher->start(1, status) == -1);
1001         REGEX_ASSERT(matcher->start(2, status) == 1);
1002
1003         REGEX_ASSERT(matcher->find());
1004         REGEX_ASSERT(matcher->start(status) == 4);
1005         REGEX_ASSERT(matcher->start(1, status) == 4);
1006         REGEX_ASSERT(matcher->start(2, status) == -1);
1007         REGEX_CHECK_STATUS;
1008
1009         delete matcher;
1010         delete pat;
1011     }
1012
1013     //
1014     //   find with zero length matches, match position should bump ahead
1015     //     to prevent loops.
1016     //
1017     {
1018         int32_t                 i;
1019         UErrorCode          status=U_ZERO_ERROR;
1020         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1021                                                       //   using an always-true look-ahead.
1022         REGEX_CHECK_STATUS;
1023         UnicodeString s("    ");
1024         m.reset(s);
1025         for (i=0; ; i++) {
1026             if (m.find() == FALSE) {
1027                 break;
1028             }
1029             REGEX_ASSERT(m.start(status) == i);
1030             REGEX_ASSERT(m.end(status) == i);
1031         }
1032         REGEX_ASSERT(i==5);
1033
1034         // Check that the bump goes over surrogate pairs OK
1035         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1036         s = s.unescape();
1037         m.reset(s);
1038         for (i=0; ; i+=2) {
1039             if (m.find() == FALSE) {
1040                 break;
1041             }
1042             REGEX_ASSERT(m.start(status) == i);
1043             REGEX_ASSERT(m.end(status) == i);
1044         }
1045         REGEX_ASSERT(i==10);
1046     }
1047     {
1048         // find() loop breaking test.
1049         //        with pattern of /.?/, should see a series of one char matches, then a single
1050         //        match of zero length at the end of the input string.
1051         int32_t                 i;
1052         UErrorCode          status=U_ZERO_ERROR;
1053         RegexMatcher        m(".?", 0, status);
1054         REGEX_CHECK_STATUS;
1055         UnicodeString s("    ");
1056         m.reset(s);
1057         for (i=0; ; i++) {
1058             if (m.find() == FALSE) {
1059                 break;
1060             }
1061             REGEX_ASSERT(m.start(status) == i);
1062             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1063         }
1064         REGEX_ASSERT(i==5);
1065     }
1066
1067
1068     //
1069     // Matchers with no input string behave as if they had an empty input string.
1070     //
1071
1072     {
1073         UErrorCode status = U_ZERO_ERROR;
1074         RegexMatcher  m(".?", 0, status);
1075         REGEX_CHECK_STATUS;
1076         REGEX_ASSERT(m.find());
1077         REGEX_ASSERT(m.start(status) == 0);
1078         REGEX_ASSERT(m.input() == "");
1079     }
1080     {
1081         UErrorCode status = U_ZERO_ERROR;
1082         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1083         RegexMatcher  *m = p->matcher(status);
1084         REGEX_CHECK_STATUS;
1085
1086         REGEX_ASSERT(m->find() == FALSE);
1087         REGEX_ASSERT(m->input() == "");
1088         delete m;
1089         delete p;
1090     }
1091
1092     //
1093     // Regions
1094     //
1095     {
1096         UErrorCode status = U_ZERO_ERROR;
1097         UnicodeString testString("This is test data");
1098         RegexMatcher m(".*", testString,  0, status);
1099         REGEX_CHECK_STATUS;
1100         REGEX_ASSERT(m.regionStart() == 0);
1101         REGEX_ASSERT(m.regionEnd() == testString.length());
1102         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1103         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1104
1105         m.region(2,4, status);
1106         REGEX_CHECK_STATUS;
1107         REGEX_ASSERT(m.matches(status));
1108         REGEX_ASSERT(m.start(status)==2);
1109         REGEX_ASSERT(m.end(status)==4);
1110         REGEX_CHECK_STATUS;
1111
1112         m.reset();
1113         REGEX_ASSERT(m.regionStart() == 0);
1114         REGEX_ASSERT(m.regionEnd() == testString.length());
1115
1116         UnicodeString shorterString("short");
1117         m.reset(shorterString);
1118         REGEX_ASSERT(m.regionStart() == 0);
1119         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1120
1121         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1122         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1123         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1124         REGEX_ASSERT(&m == &m.reset());
1125         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1126
1127         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1128         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1129         REGEX_ASSERT(&m == &m.reset());
1130         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1131
1132         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1133         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1134         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1135         REGEX_ASSERT(&m == &m.reset());
1136         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1137
1138         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1139         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1140         REGEX_ASSERT(&m == &m.reset());
1141         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1142
1143     }
1144
1145     //
1146     // hitEnd() and requireEnd()
1147     //
1148     {
1149         UErrorCode status = U_ZERO_ERROR;
1150         UnicodeString testString("aabb");
1151         RegexMatcher m1(".*", testString,  0, status);
1152         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1153         REGEX_ASSERT(m1.hitEnd() == TRUE);
1154         REGEX_ASSERT(m1.requireEnd() == FALSE);
1155         REGEX_CHECK_STATUS;
1156
1157         status = U_ZERO_ERROR;
1158         RegexMatcher m2("a*", testString, 0, status);
1159         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1160         REGEX_ASSERT(m2.hitEnd() == FALSE);
1161         REGEX_ASSERT(m2.requireEnd() == FALSE);
1162         REGEX_CHECK_STATUS;
1163
1164         status = U_ZERO_ERROR;
1165         RegexMatcher m3(".*$", testString, 0, status);
1166         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1167         REGEX_ASSERT(m3.hitEnd() == TRUE);
1168         REGEX_ASSERT(m3.requireEnd() == TRUE);
1169         REGEX_CHECK_STATUS;
1170     }
1171
1172
1173     //
1174     // Compilation error on reset with UChar *
1175     //   These were a hazard that people were stumbling over with runtime errors.
1176     //   Changed them to compiler errors by adding private methods that more closely
1177     //   matched the incorrect use of the functions.
1178     //
1179 #if 0
1180     {
1181         UErrorCode status = U_ZERO_ERROR;
1182         UChar ucharString[20];
1183         RegexMatcher m(".", 0, status);
1184         m.reset(ucharString);  // should not compile.
1185
1186         RegexPattern *p = RegexPattern::compile(".", 0, status);
1187         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1188
1189         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1190     }
1191 #endif
1192
1193     //
1194     //  Time Outs.
1195     //       Note:  These tests will need to be changed when the regexp engine is
1196     //              able to detect and cut short the exponential time behavior on
1197     //              this type of match.
1198     //
1199     {
1200         UErrorCode status = U_ZERO_ERROR;
1201         //    Enough 'a's in the string to cause the match to time out.
1202         //       (Each additional 'a' doubles the time)
1203         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1204         RegexMatcher matcher("(a+)+b", testString, 0, status);
1205         REGEX_CHECK_STATUS;
1206         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1207         matcher.setTimeLimit(100, status);
1208         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1209         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1210         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1211     }
1212     {
1213         UErrorCode status = U_ZERO_ERROR;
1214         //   Few enough 'a's to slip in under the time limit.
1215         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1216         RegexMatcher matcher("(a+)+b", testString, 0, status);
1217         REGEX_CHECK_STATUS;
1218         matcher.setTimeLimit(100, status);
1219         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1220         REGEX_CHECK_STATUS;
1221     }
1222
1223     //
1224     //  Stack Limits
1225     //
1226     {
1227         UErrorCode status = U_ZERO_ERROR;
1228         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1229
1230         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1231         //   of the '+', and makes the stack frames larger.
1232         RegexMatcher matcher("(A)+A$", testString, 0, status);
1233
1234         // With the default stack, this match should fail to run
1235         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1236         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1237
1238         // With unlimited stack, it should run
1239         status = U_ZERO_ERROR;
1240         matcher.setStackLimit(0, status);
1241         REGEX_CHECK_STATUS;
1242         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1243         REGEX_CHECK_STATUS;
1244         REGEX_ASSERT(matcher.getStackLimit() == 0);
1245
1246         // With a limited stack, the match should fail
1247         status = U_ZERO_ERROR;
1248         matcher.setStackLimit(10000, status);
1249         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1250         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1251         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1252     }
1253
1254         // A pattern that doesn't save state should work with
1255         //   a minimal sized stack
1256     {
1257         UErrorCode status = U_ZERO_ERROR;
1258         UnicodeString testString = "abc";
1259         RegexMatcher matcher("abc", testString, 0, status);
1260         REGEX_CHECK_STATUS;
1261         matcher.setStackLimit(30, status);
1262         REGEX_CHECK_STATUS;
1263         REGEX_ASSERT(matcher.matches(status) == TRUE);
1264         REGEX_CHECK_STATUS;
1265         REGEX_ASSERT(matcher.getStackLimit() == 30);
1266
1267         // Negative stack sizes should fail
1268         status = U_ZERO_ERROR;
1269         matcher.setStackLimit(1000, status);
1270         REGEX_CHECK_STATUS;
1271         matcher.setStackLimit(-1, status);
1272         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1273         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1274     }
1275
1276
1277 }
1278
1279
1280
1281
1282
1283
1284 //---------------------------------------------------------------------------
1285 //
1286 //      API_Replace        API test for class RegexMatcher, exercising the
1287 //                         replace family of functions: replaceFirst(),
1288 //                         replaceAll(), appendReplacement(), appendTail().
1289 //---------------------------------------------------------------------------
1290 void RegexTest::API_Replace() {
1291     //
1292     //  Shared fixture: pattern "abc" and a matcher over ".abc..abc...abc..".
1293     //
1294     int32_t             flags=0;
1295     UParseError         pe;
1296     UErrorCode          status=U_ZERO_ERROR;
1297
1298     UnicodeString       re("abc");
1299     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1300     REGEX_CHECK_STATUS;
1301     UnicodeString data = ".abc..abc...abc..";
1302     //                    012345678901234567
1303     RegexMatcher *matcher = pat->matcher(data, status);
1304     REGEX_CHECK_STATUS;   // Verify matcher creation before the matcher is used.
1305     //
1306     //  Plain vanilla matches: replaceFirst() changes only the first "abc",
1307     //  replaceAll() changes every one.
1308     UnicodeString  dest;
1309     dest = matcher->replaceFirst("yz", status);
1310     REGEX_CHECK_STATUS;
1311     REGEX_ASSERT(dest == ".yz..abc...abc..");
1312
1313     dest = matcher->replaceAll("yz", status);
1314     REGEX_CHECK_STATUS;
1315     REGEX_ASSERT(dest == ".yz..yz...yz..");
1316
1317     //
1318     //  Plain vanilla non-matches: the input comes back unchanged.
1319     //
1320     UnicodeString d2 = ".abx..abx...abx..";
1321     matcher->reset(d2);
1322     dest = matcher->replaceFirst("yz", status);
1323     REGEX_CHECK_STATUS;
1324     REGEX_ASSERT(dest == ".abx..abx...abx..");
1325
1326     dest = matcher->replaceAll("yz", status);
1327     REGEX_CHECK_STATUS;
1328     REGEX_ASSERT(dest == ".abx..abx...abx..");
1329
1330     //
1331     // Empty source string: the result is also empty.
1332     //
1333     UnicodeString d3 = "";
1334     matcher->reset(d3);
1335     dest = matcher->replaceFirst("yz", status);
1336     REGEX_CHECK_STATUS;
1337     REGEX_ASSERT(dest == "");
1338
1339     dest = matcher->replaceAll("yz", status);
1340     REGEX_CHECK_STATUS;
1341     REGEX_ASSERT(dest == "");
1342
1343     //
1344     // Empty substitution string: matched text is simply removed.
1345     //
1346     matcher->reset(data);              // ".abc..abc...abc.."
1347     dest = matcher->replaceFirst("", status);
1348     REGEX_CHECK_STATUS;
1349     REGEX_ASSERT(dest == "...abc...abc..");
1350
1351     dest = matcher->replaceAll("", status);
1352     REGEX_CHECK_STATUS;
1353     REGEX_ASSERT(dest == "........");
1354
1355     //
1356     // Pattern matches the whole input string.
1357     //
1358     UnicodeString d4 = "abc";
1359     matcher->reset(d4);
1360     dest = matcher->replaceFirst("xyz", status);
1361     REGEX_CHECK_STATUS;
1362     REGEX_ASSERT(dest == "xyz");
1363
1364     dest = matcher->replaceAll("xyz", status);
1365     REGEX_CHECK_STATUS;
1366     REGEX_ASSERT(dest == "xyz");
1367
1368     //
1369     // Capture Group, simple case.  $1 in the replacement text refers to
1370     // capture group 1; an escaped dollar sign is copied literally.
1371     UnicodeString       re2("a(..)");
1372     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1373     REGEX_CHECK_STATUS;
1374     UnicodeString d5 = "abcdefg";
1375     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1376     REGEX_CHECK_STATUS;
1377     dest = matcher2->replaceFirst("$1$1", status);
1378     REGEX_CHECK_STATUS;
1379     REGEX_ASSERT(dest == "bcbcdefg");
1380
1381     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1382     REGEX_CHECK_STATUS;
1383     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1384
1385     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1386     REGEX_ASSERT(U_FAILURE(status));
1387     status = U_ZERO_ERROR;
1388
1389     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1390     replacement = replacement.unescape();
1391     dest = matcher2->replaceFirst(replacement, status);
1392     REGEX_CHECK_STATUS;
1393     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1394
1395     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1396
1397
1398     //
1399     // Replacement strings containing \u and \U hex escapes; the escapes
1400     // are expanded in the replacement output.
1401     {
1402         UnicodeString  src = "abc 1 abc 2 abc 3";
1403         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1404         matcher->reset(src);
1405         UnicodeString  result = matcher->replaceAll(substitute, status);
1406         REGEX_CHECK_STATUS;
1407         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1408     }
1409     {
1410         UnicodeString  src = "abc !";
1411         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1412         matcher->reset(src);
1413         UnicodeString  result = matcher->replaceAll(substitute, status);
1414         REGEX_CHECK_STATUS;
1415         UnicodeString expected = UnicodeString("--");
1416         expected.append((UChar32)0x10000);
1417         expected.append("-- !");
1418         REGEX_ASSERT(result == expected);
1419     }
1420     // TODO:  need more thorough testing of capture substitutions.
1421
1422     // Bug 4057: extra find() or reset(pos) calls must not move the position
1423     //   from which appendReplacement() starts copying the input.
1424     {
1425         status = U_ZERO_ERROR;
1426         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1427         RegexMatcher m("ss(.*?)ee", 0, status);
1428         REGEX_CHECK_STATUS;
1429         UnicodeString result;
1430
1431         // Multiple finds do NOT bump up the previous appendReplacement position.
1432         m.reset(s);
1433         m.find();
1434         m.find();
1435         m.appendReplacement(result, "ooh", status);
1436         REGEX_CHECK_STATUS;
1437         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1438
1439         // After a reset into the interior of a string, appendReplacement still starts at beginning.
1440         status = U_ZERO_ERROR;
1441         result.truncate(0);
1442         m.reset(10, status);
1443         m.find();
1444         m.find();
1445         m.appendReplacement(result, "ooh", status);
1446         REGEX_CHECK_STATUS;
1447         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1448
1449         // find() at interior of string, appendReplacement still starts at beginning.
1450         status = U_ZERO_ERROR;
1451         result.truncate(0);
1452         m.reset();
1453         m.find(10, status);
1454         m.find();
1455         m.appendReplacement(result, "ooh", status);
1456         REGEX_CHECK_STATUS;
1457         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1458
1459         m.appendTail(result);
1460         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1461
1462     }
1463
1464     delete matcher2;
1465     delete pat2;
1466     delete matcher;
1467     delete pat;
1468 }
1469
1470
1471 //---------------------------------------------------------------------------
1472 //
1473 //      API_Pattern       Test that the API for class RegexPattern is
1474 //                        present and nominally working.
1475 //
1476 //---------------------------------------------------------------------------
1477 void RegexTest::API_Pattern() {
1478     RegexPattern        pata;    // Test default constructor to not crash.
1479     RegexPattern        patb;
1480
1481     REGEX_ASSERT(pata == patb);
1482     REGEX_ASSERT(pata == pata);
1483
1484     UnicodeString re1("abc[a-l][m-z]");
1485     UnicodeString re2("def");
1486     UErrorCode    status = U_ZERO_ERROR;
1487     UParseError   pe;
1488
1489     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1490     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1491     REGEX_CHECK_STATUS;
1492     REGEX_ASSERT(*pat1 == *pat1);
1493     REGEX_ASSERT(*pat1 != pata);
1494
1495     // Assign
1496     patb = *pat1;
1497     REGEX_ASSERT(patb == *pat1);
1498
1499     // Copy Construct
1500     RegexPattern patc(*pat1);
1501     REGEX_ASSERT(patc == *pat1);
1502     REGEX_ASSERT(patb == patc);
1503     REGEX_ASSERT(pat1 != pat2);
1504     patb = *pat2;
1505     REGEX_ASSERT(patb != patc);
1506     REGEX_ASSERT(patb == *pat2);
1507
1508     // Compile with no flags.
1509     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1510     REGEX_ASSERT(*pat1a == *pat1);
1511
1512     REGEX_ASSERT(pat1a->flags() == 0);
1513
1514     // Compile with different flags should be not equal
1515     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1516     REGEX_CHECK_STATUS;
1517
1518     REGEX_ASSERT(*pat1b != *pat1a);
1519     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1520     REGEX_ASSERT(pat1a->flags() == 0);
1521     delete pat1b;
1522
1523     // clone
1524     RegexPattern *pat1c = pat1->clone();
1525     REGEX_ASSERT(*pat1c == *pat1);
1526     REGEX_ASSERT(*pat1c != *pat2);
1527
1528     delete pat1c;
1529     delete pat1a;
1530     delete pat1;
1531     delete pat2;
1532
1533
1534     //
1535     //   Verify that a matcher created from a cloned pattern works.
1536     //     (Jitterbug 3423)
1537     //
1538     {
1539         UErrorCode     status     = U_ZERO_ERROR;
1540         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1541         RegexPattern  *pClone     = pSource->clone();
1542         delete         pSource;
1543         RegexMatcher  *mFromClone = pClone->matcher(status);
1544         REGEX_CHECK_STATUS;
1545         UnicodeString s = "Hello World";
1546         mFromClone->reset(s);
1547         REGEX_ASSERT(mFromClone->find() == TRUE);
1548         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1549         REGEX_ASSERT(mFromClone->find() == TRUE);
1550         REGEX_ASSERT(mFromClone->group(status) == "World");
1551         REGEX_ASSERT(mFromClone->find() == FALSE);
1552         delete mFromClone;
1553         delete pClone;
1554     }
1555
1556     //
1557     //   matches convenience API
1558     //
1559     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1560     REGEX_CHECK_STATUS;
1561     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1562     REGEX_CHECK_STATUS;
1563     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1564     REGEX_CHECK_STATUS;
1565     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1566     REGEX_CHECK_STATUS;
1567     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1568     REGEX_CHECK_STATUS;
1569     status = U_INDEX_OUTOFBOUNDS_ERROR;
1570     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1571     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1572
1573
1574     //
1575     // Split()
1576     //
1577     status = U_ZERO_ERROR;
1578     pat1 = RegexPattern::compile(" +",  pe, status);
1579     REGEX_CHECK_STATUS;
1580     UnicodeString  fields[10];
1581
1582     int32_t n;
1583     n = pat1->split("Now is the time", fields, 10, status);
1584     REGEX_CHECK_STATUS;
1585     REGEX_ASSERT(n==4);
1586     REGEX_ASSERT(fields[0]=="Now");
1587     REGEX_ASSERT(fields[1]=="is");
1588     REGEX_ASSERT(fields[2]=="the");
1589     REGEX_ASSERT(fields[3]=="time");
1590     REGEX_ASSERT(fields[4]=="");
1591
1592     n = pat1->split("Now is the time", fields, 2, status);
1593     REGEX_CHECK_STATUS;
1594     REGEX_ASSERT(n==2);
1595     REGEX_ASSERT(fields[0]=="Now");
1596     REGEX_ASSERT(fields[1]=="is the time");
1597     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1598
1599     fields[1] = "*";
1600     status = U_ZERO_ERROR;
1601     n = pat1->split("Now is the time", fields, 1, status);
1602     REGEX_CHECK_STATUS;
1603     REGEX_ASSERT(n==1);
1604     REGEX_ASSERT(fields[0]=="Now is the time");
1605     REGEX_ASSERT(fields[1]=="*");
1606     status = U_ZERO_ERROR;
1607
1608     n = pat1->split("    Now       is the time   ", fields, 10, status);
1609     REGEX_CHECK_STATUS;
1610     REGEX_ASSERT(n==6);
1611     REGEX_ASSERT(fields[0]=="");
1612     REGEX_ASSERT(fields[1]=="Now");
1613     REGEX_ASSERT(fields[2]=="is");
1614     REGEX_ASSERT(fields[3]=="the");
1615     REGEX_ASSERT(fields[4]=="time");
1616     REGEX_ASSERT(fields[5]=="");
1617
1618     n = pat1->split("     ", fields, 10, status);
1619     REGEX_CHECK_STATUS;
1620     REGEX_ASSERT(n==2);
1621     REGEX_ASSERT(fields[0]=="");
1622     REGEX_ASSERT(fields[1]=="");
1623
1624     fields[0] = "foo";
1625     n = pat1->split("", fields, 10, status);
1626     REGEX_CHECK_STATUS;
1627     REGEX_ASSERT(n==0);
1628     REGEX_ASSERT(fields[0]=="foo");
1629
1630     delete pat1;
1631
1632     //  split, with a pattern with (capture)
1633     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1634     REGEX_CHECK_STATUS;
1635
1636     status = U_ZERO_ERROR;
1637     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1638     REGEX_CHECK_STATUS;
1639     REGEX_ASSERT(n==7);
1640     REGEX_ASSERT(fields[0]=="");
1641     REGEX_ASSERT(fields[1]=="a");
1642     REGEX_ASSERT(fields[2]=="Now is ");
1643     REGEX_ASSERT(fields[3]=="b");
1644     REGEX_ASSERT(fields[4]=="the time");
1645     REGEX_ASSERT(fields[5]=="c");
1646     REGEX_ASSERT(fields[6]=="");
1647     REGEX_ASSERT(status==U_ZERO_ERROR);
1648
1649     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1650     REGEX_CHECK_STATUS;
1651     REGEX_ASSERT(n==7);
1652     REGEX_ASSERT(fields[0]=="  ");
1653     REGEX_ASSERT(fields[1]=="a");
1654     REGEX_ASSERT(fields[2]=="Now is ");
1655     REGEX_ASSERT(fields[3]=="b");
1656     REGEX_ASSERT(fields[4]=="the time");
1657     REGEX_ASSERT(fields[5]=="c");
1658     REGEX_ASSERT(fields[6]=="");
1659
1660     status = U_ZERO_ERROR;
1661     fields[6] = "foo";
1662     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1663     REGEX_CHECK_STATUS;
1664     REGEX_ASSERT(n==6);
1665     REGEX_ASSERT(fields[0]=="  ");
1666     REGEX_ASSERT(fields[1]=="a");
1667     REGEX_ASSERT(fields[2]=="Now is ");
1668     REGEX_ASSERT(fields[3]=="b");
1669     REGEX_ASSERT(fields[4]=="the time");
1670     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1671     REGEX_ASSERT(fields[6]=="foo");
1672
1673     status = U_ZERO_ERROR;
1674     fields[5] = "foo";
1675     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1676     REGEX_CHECK_STATUS;
1677     REGEX_ASSERT(n==5);
1678     REGEX_ASSERT(fields[0]=="  ");
1679     REGEX_ASSERT(fields[1]=="a");
1680     REGEX_ASSERT(fields[2]=="Now is ");
1681     REGEX_ASSERT(fields[3]=="b");
1682     REGEX_ASSERT(fields[4]=="the time<c>");
1683     REGEX_ASSERT(fields[5]=="foo");
1684
1685     status = U_ZERO_ERROR;
1686     fields[5] = "foo";
1687     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1688     REGEX_CHECK_STATUS;
1689     REGEX_ASSERT(n==5);
1690     REGEX_ASSERT(fields[0]=="  ");
1691     REGEX_ASSERT(fields[1]=="a");
1692     REGEX_ASSERT(fields[2]=="Now is ");
1693     REGEX_ASSERT(fields[3]=="b");
1694     REGEX_ASSERT(fields[4]=="the time");
1695     REGEX_ASSERT(fields[5]=="foo");
1696
1697     status = U_ZERO_ERROR;
1698     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1699     REGEX_CHECK_STATUS;
1700     REGEX_ASSERT(n==4);
1701     REGEX_ASSERT(fields[0]=="  ");
1702     REGEX_ASSERT(fields[1]=="a");
1703     REGEX_ASSERT(fields[2]=="Now is ");
1704     REGEX_ASSERT(fields[3]=="the time<c>");
1705     status = U_ZERO_ERROR;
1706     delete pat1;
1707
1708     pat1 = RegexPattern::compile("([-,])",  pe, status);
1709     REGEX_CHECK_STATUS;
1710     n = pat1->split("1-10,20", fields, 10, status);
1711     REGEX_CHECK_STATUS;
1712     REGEX_ASSERT(n==5);
1713     REGEX_ASSERT(fields[0]=="1");
1714     REGEX_ASSERT(fields[1]=="-");
1715     REGEX_ASSERT(fields[2]=="10");
1716     REGEX_ASSERT(fields[3]==",");
1717     REGEX_ASSERT(fields[4]=="20");
1718     delete pat1;
1719
1720     // Test split of string with empty trailing fields
1721     pat1 = RegexPattern::compile(",", pe, status);
1722     REGEX_CHECK_STATUS;
1723     n = pat1->split("a,b,c,", fields, 10, status);
1724     REGEX_CHECK_STATUS;
1725     REGEX_ASSERT(n==4);
1726     REGEX_ASSERT(fields[0]=="a");
1727     REGEX_ASSERT(fields[1]=="b");
1728     REGEX_ASSERT(fields[2]=="c");
1729     REGEX_ASSERT(fields[3]=="");
1730
1731     n = pat1->split("a,,,", fields, 10, status);
1732     REGEX_CHECK_STATUS;
1733     REGEX_ASSERT(n==4);
1734     REGEX_ASSERT(fields[0]=="a");
1735     REGEX_ASSERT(fields[1]=="");
1736     REGEX_ASSERT(fields[2]=="");
1737     REGEX_ASSERT(fields[3]=="");
1738     delete pat1;
1739
1740     // Split Separator with zero length match.
1741     pat1 = RegexPattern::compile(":?", pe, status);
1742     REGEX_CHECK_STATUS;
1743     n = pat1->split("abc", fields, 10, status);
1744     REGEX_CHECK_STATUS;
1745     REGEX_ASSERT(n==5);
1746     REGEX_ASSERT(fields[0]=="");
1747     REGEX_ASSERT(fields[1]=="a");
1748     REGEX_ASSERT(fields[2]=="b");
1749     REGEX_ASSERT(fields[3]=="c");
1750     REGEX_ASSERT(fields[4]=="");
1751
1752     delete pat1;
1753
1754     //
1755     // RegexPattern::pattern()
1756     //
1757     pat1 = new RegexPattern();
1758     REGEX_ASSERT(pat1->pattern() == "");
1759     delete pat1;
1760
1761     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1762     REGEX_CHECK_STATUS;
1763     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1764     delete pat1;
1765
1766
1767     //
1768     // classID functions
1769     //
1770     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1771     REGEX_CHECK_STATUS;
1772     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1773     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1774     UnicodeString Hello("Hello, world.");
1775     RegexMatcher *m = pat1->matcher(Hello, status);
1776     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1777     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1778     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1779     delete m;
1780     delete pat1;
1781
1782 }
1783
1784 //---------------------------------------------------------------------------
1785 //
1786 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1787 //                       is present and working, but excluding functions
1788 //                       implementing replace operations.
1789 //
1790 //---------------------------------------------------------------------------
1791 void RegexTest::API_Match_UTF8() {
         // Exercises the non-replace RegexMatcher API through UText inputs, most of
         // them opened over UTF-8 data: reset, matches, lookingAt, start/end/group,
         // find, regions and bounds flags, hitEnd and requireEnd.
         // Many patterns and subjects are spelled as explicit byte arrays with the
         // readable text in a trailing comment -- presumably so the bytes are UTF-8
         // regardless of the compiler's character set (see the EBCDIC note at the
         // top of this file).
         // Each braced section below is an independent scenario; all but the first
         // declare their own status (and flags/pe where needed), shadowing the
         // function-level variables.
1792     UParseError         pe;
1793     UErrorCode          status=U_ZERO_ERROR;
1794     int32_t             flags = 0;
1795
1796     //
1797     // Debug - slide failing test cases early
1798     //
1799 #if 0
1800     {
1801     }
1802     return;
1803 #endif
1804
1805     //
1806     // Simple pattern compilation
1807     //
1808     {
1809         UText               re = UTEXT_INITIALIZER;
1810         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1811         REGEX_VERBOSE_TEXT(&re);
1812         RegexPattern        *pat2;
1813         pat2 = RegexPattern::compile(&re, flags, pe, status);
1814         REGEX_CHECK_STATUS;
1815
1816         UText input1 = UTEXT_INITIALIZER;
1817         UText input2 = UTEXT_INITIALIZER;
1818         UText empty  = UTEXT_INITIALIZER;
1819         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1820         REGEX_VERBOSE_TEXT(&input1);
1821         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1822         REGEX_VERBOSE_TEXT(&input2);
1823         utext_openUChars(&empty, NULL, 0, &status);
1824
             // Lengths of the two subject strings; used below as the end-of-input
             // positions for reset(pos), matches(pos) and lookingAt(pos).
1825         int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1826         int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1827
1828
1829         //
1830         // Matcher creation and reset.
1831         //
             // matcher(status) yields a matcher with no input yet; reset(&input1)
             // attaches the text and returns the matcher by reference, hence the '&'.
1832         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1833         REGEX_CHECK_STATUS;
1834         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1835         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1836         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1837         m1->reset(&input2);
1838         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1839         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1840         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1841         m1->reset(&input1);
1842         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1843         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1844         m1->reset(&empty);
1845         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1846         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1847
1848         //
1849         //  reset(pos, status)
1850         //
             // Positions 0 .. input1Len are expected to be accepted; -1 and
             // input1Len+1 are expected to set U_INDEX_OUTOFBOUNDS_ERROR.
1851         m1->reset(&input1);
1852         m1->reset(4, status);
1853         REGEX_CHECK_STATUS;
1854         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1855         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1856
1857         m1->reset(-1, status);
1858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1859         status = U_ZERO_ERROR;
1860
1861         m1->reset(0, status);
1862         REGEX_CHECK_STATUS;
1863         status = U_ZERO_ERROR;
1864
1865         m1->reset(input1Len-1, status);
1866         REGEX_CHECK_STATUS;
1867         status = U_ZERO_ERROR;
1868
1869         m1->reset(input1Len, status);
1870         REGEX_CHECK_STATUS;
1871         status = U_ZERO_ERROR;
1872
1873         m1->reset(input1Len+1, status);
1874         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1875         status = U_ZERO_ERROR;
1876
1877         //
1878         // match(pos, status)
1879         //
             // Subject is "not abc": "abc" starts at offset 4, so only matches(4)
             // is expected to succeed.
1880         m1->reset(&input2);
1881         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1882         m1->reset();
1883         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1884         m1->reset();
1885         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1886         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1887         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1888         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1889
1890         // Match() at end of string should fail, but should not
1891         //  be an error.
1892         status = U_ZERO_ERROR;
1893         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1894         REGEX_CHECK_STATUS;
1895
1896         // Match beyond end of string should fail with an error.
1897         status = U_ZERO_ERROR;
1898         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1899         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1900
1901         // Successful match at end of string.
1902         {
1903             status = U_ZERO_ERROR;
1904             RegexMatcher m("A?", 0, status);  // will match zero length string.
1905             REGEX_CHECK_STATUS;
1906             m.reset(&input1);
1907             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1908             REGEX_CHECK_STATUS;
1909             m.reset(&empty);
1910             REGEX_ASSERT(m.matches(0, status) == TRUE);
1911             REGEX_CHECK_STATUS;
1912         }
1913
1914
1915         //
1916         // lookingAt(pos, status)
1917         //
             // Same position checks as for matches(pos) above, applied to lookingAt.
1918         status = U_ZERO_ERROR;
1919         m1->reset(&input2);  // "not abc"
1920         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1921         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1922         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1923         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1924         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1925         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1926         status = U_ZERO_ERROR;
1927         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1928         REGEX_CHECK_STATUS;
1929         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1930         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1931
1932         delete m1;
1933         delete pat2;
1934
1935         utext_close(&re);
1936         utext_close(&input1);
1937         utext_close(&input2);
1938         utext_close(&empty);
1939     }
1940
1941
1942     //
1943     // Capture Group.
1944     //     RegexMatcher::start();
1945     //     RegexMatcher::end();
1946     //     RegexMatcher::groupCount();
1947     //
1948     {
             // These locals shadow the function-level flags/pe/status.
1949         int32_t             flags=0;
1950         UParseError         pe;
1951         UErrorCode          status=U_ZERO_ERROR;
1952         UText               re=UTEXT_INITIALIZER;
1953         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1954         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1955
1956         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1957         REGEX_CHECK_STATUS;
1958
1959         UText input = UTEXT_INITIALIZER;
1960         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1961         utext_openUTF8(&input, str_0123456789, -1, &status);
1962
1963         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1964         REGEX_CHECK_STATUS;
1965         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
             // Expected start/end offsets of groups 0..3 for the pattern
             // 01(23(45)67)(.*) applied to "0123456789".
1966         static const int32_t matchStarts[] = {0,  2, 4, 8};
1967         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1968         int32_t i;
1969         for (i=0; i<4; i++) {
1970             int32_t actualStart = matcher->start(i, status);
1971             REGEX_CHECK_STATUS;
1972             if (actualStart != matchStarts[i]) {
1973                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1974                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
1975             }
1976             int32_t actualEnd = matcher->end(i, status);
1977             REGEX_CHECK_STATUS;
1978             if (actualEnd != matchEnds[i]) {
1979                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1980                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1981             }
1982         }
1983
1984         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1985         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1986
             // Group numbers outside 0..3 must be rejected; after reset() there
             // is no current match, so start() must report an invalid state.
1987         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1988         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1989         matcher->reset();
1990         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1991
             // Re-establish a match so that the group() calls below have one.
1992         matcher->lookingAt(status);
1993
1994         UnicodeString dest;
1995         UText destText = UTEXT_INITIALIZER;
1996         utext_openUnicodeString(&destText, &dest, &status);
1997         UText *result;
1998         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1999         //  Test shallow-clone API
             // With a NULL destination the returned UText is closed by the test;
             // with &destText the returned pointer is asserted to be &destText.
2000         int64_t   group_len;
2001         result = matcher->group((UText *)NULL, group_len, status);
2002         REGEX_CHECK_STATUS;
2003         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2004         utext_close(result);
2005         result = matcher->group(0, &destText, group_len, status);
2006         REGEX_CHECK_STATUS;
2007         REGEX_ASSERT(result == &destText);
2008         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2009         //  destText is now immutable, reopen it
2010         utext_close(&destText);
2011         utext_openUnicodeString(&destText, &dest, &status);
2012
             // For each group the assertions below expect: the returned UText's
             // content is the whole input, its native index is the group's start,
             // and 'length' receives the group's length.
2013         int64_t length;
2014         result = matcher->group(0, NULL, length, status);
2015         REGEX_CHECK_STATUS;
2016         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2017         utext_close(result);
2018         result = matcher->group(0, &destText, length, status);
2019         REGEX_CHECK_STATUS;
2020         REGEX_ASSERT(result == &destText);
2021         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2022         REGEX_ASSERT(length == 10);
2023         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2024
2025         // Capture Group 1 == "234567"
2026         result = matcher->group(1, NULL, length, status);
2027         REGEX_CHECK_STATUS;
2028         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2029         REGEX_ASSERT(length == 6);
2030         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2031         utext_close(result);
2032
2033         result = matcher->group(1, &destText, length, status);
2034         REGEX_CHECK_STATUS;
2035         REGEX_ASSERT(result == &destText);
2036         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2037         REGEX_ASSERT(length == 6);
2038         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
             // result aliases destText here, so this closes destText; the later
             // group() calls pass &destText in again as the destination.
2039         utext_close(result);
2040
2041         // Capture Group 2 == "45"
2042         result = matcher->group(2, NULL, length, status);
2043         REGEX_CHECK_STATUS;
2044         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2045         REGEX_ASSERT(length == 2);
2046         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2047         utext_close(result);
2048
2049         result = matcher->group(2, &destText, length, status);
2050         REGEX_CHECK_STATUS;
2051         REGEX_ASSERT(result == &destText);
2052         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2053         REGEX_ASSERT(length == 2);
2054         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2055         utext_close(result);
2056
2057         // Capture Group 3 == "89"
2058         result = matcher->group(3, NULL, length, status);
2059         REGEX_CHECK_STATUS;
2060         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2061         REGEX_ASSERT(length == 2);
2062         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2063         utext_close(result);
2064
2065         result = matcher->group(3, &destText, length, status);
2066         REGEX_CHECK_STATUS;
2067         REGEX_ASSERT(result == &destText);
2068         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2069         REGEX_ASSERT(length == 2);
2070         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2071         utext_close(result);
2072
2073         // Capture Group number out of range.
2074         status = U_ZERO_ERROR;
2075         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2076         status = U_ZERO_ERROR;
2077         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2078         status = U_ZERO_ERROR;
2079         matcher->reset();
2080         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2081
2082         delete matcher;
2083         delete pat;
2084
2085         utext_close(&destText);
2086         utext_close(&input);
2087         utext_close(&re);
2088     }
2089
2090     //
2091     //  find
2092     //
2093     {
2094         int32_t             flags=0;
2095         UParseError         pe;
2096         UErrorCode          status=U_ZERO_ERROR;
2097         UText               re=UTEXT_INITIALIZER;
2098         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2099         utext_openUTF8(&re, str_abc, -1, &status);
2100
2101         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2102         REGEX_CHECK_STATUS;
2103         UText input = UTEXT_INITIALIZER;
2104         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2105         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2106         //                      012345678901234567
2107
             // "abc" occurs at offsets 1, 6 and 12 of the 17-byte subject.
2108         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2109         REGEX_CHECK_STATUS;
2110         REGEX_ASSERT(matcher->find());
2111         REGEX_ASSERT(matcher->start(status) == 1);
2112         REGEX_ASSERT(matcher->find());
2113         REGEX_ASSERT(matcher->start(status) == 6);
2114         REGEX_ASSERT(matcher->find());
2115         REGEX_ASSERT(matcher->start(status) == 12);
2116         REGEX_ASSERT(matcher->find() == FALSE);
2117         REGEX_ASSERT(matcher->find() == FALSE);
2118
2119         matcher->reset();
2120         REGEX_ASSERT(matcher->find());
2121         REGEX_ASSERT(matcher->start(status) == 1);
2122
             // find(pos, status): search starts at the given offset.
2123         REGEX_ASSERT(matcher->find(0, status));
2124         REGEX_ASSERT(matcher->start(status) == 1);
2125         REGEX_ASSERT(matcher->find(1, status));
2126         REGEX_ASSERT(matcher->start(status) == 1);
2127         REGEX_ASSERT(matcher->find(2, status));
2128         REGEX_ASSERT(matcher->start(status) == 6);
2129         REGEX_ASSERT(matcher->find(12, status));
2130         REGEX_ASSERT(matcher->start(status) == 12);
2131         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2132         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2133         REGEX_ASSERT(matcher->find(17, status) == FALSE);
             // The last find failed, so there is no current match to query.
2134         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2135
             // Start positions below 0 or past the end (17) are index errors.
2136         status = U_ZERO_ERROR;
2137         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138         status = U_ZERO_ERROR;
2139         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2140
2141         REGEX_ASSERT(matcher->groupCount() == 0);
2142
2143         delete matcher;
2144         delete pat;
2145
2146         utext_close(&input);
2147         utext_close(&re);
2148     }
2149
2150
2151     //
2152     //  find, with \G in pattern (true if at the end of a previous match).
2153     //
2154     {
2155         int32_t             flags=0;
2156         UParseError         pe;
2157         UErrorCode          status=U_ZERO_ERROR;
2158         UText               re=UTEXT_INITIALIZER;
2159         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2160         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2161
2162         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2163
2164         REGEX_CHECK_STATUS;
2165         UText input = UTEXT_INITIALIZER;
2166         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2167         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2168         //                      012345678901234567
2169
2170         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2171         REGEX_CHECK_STATUS;
             // First find: expected to match through the plain (abc) alternative,
             // so group 2 starts at 1 and group 1 reports -1.
2172         REGEX_ASSERT(matcher->find());
2173         REGEX_ASSERT(matcher->start(status) == 0);
2174         REGEX_ASSERT(matcher->start(1, status) == -1);
2175         REGEX_ASSERT(matcher->start(2, status) == 1);
2176
             // Second find starts where the first ended (offset 4), so the
             // (\Gabc) alternative is expected to match: group 1 at 4, group 2 -1.
2177         REGEX_ASSERT(matcher->find());
2178         REGEX_ASSERT(matcher->start(status) == 4);
2179         REGEX_ASSERT(matcher->start(1, status) == 4);
2180         REGEX_ASSERT(matcher->start(2, status) == -1);
2181         REGEX_CHECK_STATUS;
2182
2183         delete matcher;
2184         delete pat;
2185
2186         utext_close(&input);
2187         utext_close(&re);
2188     }
2189
2190     //
2191     //   find with zero length matches, match position should bump ahead
2192     //     to prevent loops.
2193     //
2194     {
2195         int32_t                 i;
2196         UErrorCode          status=U_ZERO_ERROR;
2197         RegexMatcher        m("(?= ?)", 0, status);   // This pattern produces zero-length matches anywhere,
2198                                                       //   using an always-true look-ahead.
2199         REGEX_CHECK_STATUS;
2200         UText s = UTEXT_INITIALIZER;
2201         utext_openUTF8(&s, "    ", -1, &status);
2202         m.reset(&s);
             // Four characters give five zero-length matches, at offsets 0..4;
             // the loop exits with i==5 when find() finally fails.
2203         for (i=0; ; i++) {
2204             if (m.find() == FALSE) {
2205                 break;
2206             }
2207             REGEX_ASSERT(m.start(status) == i);
2208             REGEX_ASSERT(m.end(status) == i);
2209         }
2210         REGEX_ASSERT(i==5);
2211
2212         // Check that the bump goes over characters outside the BMP OK
2213         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2214         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
             // Re-opens the same UText struct over the new bytes without an
             // intervening utext_close().
2215         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2216         m.reset(&s);
             // Each character above is 4 bytes long, so matches are expected 4
             // native units apart (0, 4, 8, 12, 16); the loop exits with i==20.
2217         for (i=0; ; i+=4) {
2218             if (m.find() == FALSE) {
2219                 break;
2220             }
2221             REGEX_ASSERT(m.start(status) == i);
2222             REGEX_ASSERT(m.end(status) == i);
2223         }
2224         REGEX_ASSERT(i==20);
2225
2226         utext_close(&s);
2227     }
2228     {
2229         // find() loop breaking test.
2230         //        with pattern of /.?/, should see a series of one char matches, then a single
2231         //        match of zero length at the end of the input string.
2232         int32_t                 i;
2233         UErrorCode          status=U_ZERO_ERROR;
2234         RegexMatcher        m(".?", 0, status);
2235         REGEX_CHECK_STATUS;
2236         UText s = UTEXT_INITIALIZER;
2237         utext_openUTF8(&s, "    ", -1, &status);
2238         m.reset(&s);
2239         for (i=0; ; i++) {
2240             if (m.find() == FALSE) {
2241                 break;
2242             }
2243             REGEX_ASSERT(m.start(status) == i);
                 // One-character matches for i = 0..3, then an empty match at 4.
2244             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2245         }
2246         REGEX_ASSERT(i==5);
2247
2248         utext_close(&s);
2249     }
2250
2251
2252     //
2253     // Matchers with no input string behave as if they had an empty input string.
2254     //
2255
2256     {
2257         UErrorCode status = U_ZERO_ERROR;
2258         RegexMatcher  m(".?", 0, status);
2259         REGEX_CHECK_STATUS;
2260         REGEX_ASSERT(m.find());
2261         REGEX_ASSERT(m.start(status) == 0);
2262         REGEX_ASSERT(m.input() == "");
2263     }
2264     {
2265         UErrorCode status = U_ZERO_ERROR;
2266         RegexPattern  *p = RegexPattern::compile(".", 0, status);
             // NOTE(review): status is only checked after matcher(); if compile()
             // failed, p may be null at this call -- confirm this is acceptable.
2267         RegexMatcher  *m = p->matcher(status);
2268         REGEX_CHECK_STATUS;
2269
2270         REGEX_ASSERT(m->find() == FALSE);
2271         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2272         delete m;
2273         delete p;
2274     }
2275
2276     //
2277     // Regions
2278     //
2279     {
2280         UErrorCode status = U_ZERO_ERROR;
2281         UText testPattern = UTEXT_INITIALIZER;
2282         UText testText    = UTEXT_INITIALIZER;
2283         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2284         REGEX_VERBOSE_TEXT(&testPattern);
2285         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2286         REGEX_VERBOSE_TEXT(&testText);
2287
             // A freshly constructed matcher is expected to cover the whole
             // input, with anchoring bounds on and transparent bounds off.
2288         RegexMatcher m(&testPattern, &testText, 0, status);
2289         REGEX_CHECK_STATUS;
2290         REGEX_ASSERT(m.regionStart() == 0);
2291         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2292         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2293         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2294
             // With the region set to 2..4, ".*" is expected to match exactly
             // that span.
2295         m.region(2,4, status);
2296         REGEX_CHECK_STATUS;
2297         REGEX_ASSERT(m.matches(status));
2298         REGEX_ASSERT(m.start(status)==2);
2299         REGEX_ASSERT(m.end(status)==4);
2300         REGEX_CHECK_STATUS;
2301
             // reset() is expected to restore the region to the full input.
2302         m.reset();
2303         REGEX_ASSERT(m.regionStart() == 0);
2304         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2305
2306         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2307         REGEX_VERBOSE_TEXT(&testText);
2308         m.reset(&testText);
2309         REGEX_ASSERT(m.regionStart() == 0);
2310         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2311
             // The use*Bounds() setters and reset() return the matcher itself,
             // and the bounds flags are expected to survive reset().
2312         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2313         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2314         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2315         REGEX_ASSERT(&m == &m.reset());
2316         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2317
2318         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2319         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2320         REGEX_ASSERT(&m == &m.reset());
2321         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2322
2323         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2324         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2325         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2326         REGEX_ASSERT(&m == &m.reset());
2327         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2328
2329         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2330         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2331         REGEX_ASSERT(&m == &m.reset());
2332         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2333
2334         utext_close(&testText);
2335         utext_close(&testPattern);
2336     }
2337
2338     //
2339     // hitEnd() and requireEnd()
2340     //
2341     {
2342         UErrorCode status = U_ZERO_ERROR;
2343         UText testPattern = UTEXT_INITIALIZER;
2344         UText testText    = UTEXT_INITIALIZER;
2345         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2346         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2347         utext_openUTF8(&testPattern, str_, -1, &status);
2348         utext_openUTF8(&testText, str_aabb, -1, &status);
2349
             // ".*" on "aabb": expected hitEnd TRUE, requireEnd FALSE.
2350         RegexMatcher m1(&testPattern, &testText,  0, status);
2351         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2352         REGEX_ASSERT(m1.hitEnd() == TRUE);
2353         REGEX_ASSERT(m1.requireEnd() == FALSE);
2354         REGEX_CHECK_STATUS;
2355
             // "a*" on "aabb": expected hitEnd FALSE, requireEnd FALSE.
             // testPattern is re-opened over the new pattern bytes without an
             // intervening utext_close().
2356         status = U_ZERO_ERROR;
2357         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2358         utext_openUTF8(&testPattern, str_a, -1, &status);
2359         RegexMatcher m2(&testPattern, &testText, 0, status);
2360         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2361         REGEX_ASSERT(m2.hitEnd() == FALSE);
2362         REGEX_ASSERT(m2.requireEnd() == FALSE);
2363         REGEX_CHECK_STATUS;
2364
             // ".*$" on "aabb": expected hitEnd TRUE and requireEnd TRUE.
2365         status = U_ZERO_ERROR;
2366         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2367         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2368         RegexMatcher m3(&testPattern, &testText, 0, status);
2369         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2370         REGEX_ASSERT(m3.hitEnd() == TRUE);
2371         REGEX_ASSERT(m3.requireEnd() == TRUE);
2372         REGEX_CHECK_STATUS;
2373
2374         utext_close(&testText);
2375         utext_close(&testPattern);
2376     }
2377 }
2378
2379
2380 //---------------------------------------------------------------------------
2381 //
2382 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2383 //                         Replace family of functions (replaceFirst, replaceAll,
2384 //                         appendReplacement, appendTail) with UText arguments.
2385 //---------------------------------------------------------------------------
2386 void RegexTest::API_Replace_UTF8() {
2387     //
2388     //  Setup: compile the pattern "abc" and attach a matcher to ".abc..abc...abc..".
2389     //  Test text is spelled as byte values; the comment after each array shows it.
2390     int32_t             flags=0;
2391     UParseError         pe;
2392     UErrorCode          status=U_ZERO_ERROR;
2393
2394     UText               re=UTEXT_INITIALIZER;
2395     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2396     REGEX_VERBOSE_TEXT(&re);
2397     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2398     REGEX_CHECK_STATUS;
2399
2400     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2401     //             012345678901234567
2402     UText dataText = UTEXT_INITIALIZER;
2403     utext_openUTF8(&dataText, data, -1, &status);
2404     REGEX_CHECK_STATUS;
2405     REGEX_VERBOSE_TEXT(&dataText);
2406     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);   // address of reset()'s return value; deleted at the end of the test
2407
2408     //
2409     //  Plain vanilla matches.  Each replace call is checked twice: with a NULL
2410     //  destination (the returned UText is closed here) and with &destText as destination.
2411     UnicodeString  dest;
2412     UText destText = UTEXT_INITIALIZER;
2413     utext_openUnicodeString(&destText, &dest, &status);   // destText is backed by the UnicodeString dest
2414     UText *result;
2415
2416     UText replText = UTEXT_INITIALIZER;
2417
2418     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2419     utext_openUTF8(&replText, str_yz, -1, &status);
2420     REGEX_VERBOSE_TEXT(&replText);
2421     result = matcher->replaceFirst(&replText, NULL, status);
2422     REGEX_CHECK_STATUS;
2423     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2424     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2425     utext_close(result);
2426     result = matcher->replaceFirst(&replText, &destText, status);
2427     REGEX_CHECK_STATUS;
2428     REGEX_ASSERT(result == &destText);
2429     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2430
2431     result = matcher->replaceAll(&replText, NULL, status);
2432     REGEX_CHECK_STATUS;
2433     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2434     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2435     utext_close(result);
2436
2437     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);   // empty destText before it is reused
2438     result = matcher->replaceAll(&replText, &destText, status);
2439     REGEX_CHECK_STATUS;
2440     REGEX_ASSERT(result == &destText);
2441     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2442
2443     //
2444     //  Plain vanilla non-matches.  The expected output is the unchanged input.
2445     //
2446     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2447     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2448     matcher->reset(&dataText);
2449
2450     result = matcher->replaceFirst(&replText, NULL, status);
2451     REGEX_CHECK_STATUS;
2452     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2453     utext_close(result);
2454     result = matcher->replaceFirst(&replText, &destText, status);
2455     REGEX_CHECK_STATUS;
2456     REGEX_ASSERT(result == &destText);
2457     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2458
2459     result = matcher->replaceAll(&replText, NULL, status);
2460     REGEX_CHECK_STATUS;
2461     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2462     utext_close(result);
2463     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2464     result = matcher->replaceAll(&replText, &destText, status);
2465     REGEX_CHECK_STATUS;
2466     REGEX_ASSERT(result == &destText);
2467     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2468
2469     //
2470     // Empty source string
2471     //
2472     utext_openUTF8(&dataText, NULL, 0, &status);   // zero-length input
2473     matcher->reset(&dataText);
2474
2475     result = matcher->replaceFirst(&replText, NULL, status);
2476     REGEX_CHECK_STATUS;
2477     REGEX_ASSERT_UTEXT_UTF8("", result);
2478     utext_close(result);
2479     result = matcher->replaceFirst(&replText, &destText, status);
2480     REGEX_CHECK_STATUS;
2481     REGEX_ASSERT(result == &destText);
2482     REGEX_ASSERT_UTEXT_UTF8("", result);
2483
2484     result = matcher->replaceAll(&replText, NULL, status);
2485     REGEX_CHECK_STATUS;
2486     REGEX_ASSERT_UTEXT_UTF8("", result);
2487     utext_close(result);
2488     result = matcher->replaceAll(&replText, &destText, status);
2489     REGEX_CHECK_STATUS;
2490     REGEX_ASSERT(result == &destText);
2491     REGEX_ASSERT_UTEXT_UTF8("", result);
2492
2493     //
2494     // Empty substitution string: each match is expected to be deleted from the output.
2495     //
2496     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2497     matcher->reset(&dataText);
2498
2499     utext_openUTF8(&replText, NULL, 0, &status);   // zero-length replacement
2500     result = matcher->replaceFirst(&replText, NULL, status);
2501     REGEX_CHECK_STATUS;
2502     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2503     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2504     utext_close(result);
2505     result = matcher->replaceFirst(&replText, &destText, status);
2506     REGEX_CHECK_STATUS;
2507     REGEX_ASSERT(result == &destText);
2508     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2509
2510     result = matcher->replaceAll(&replText, NULL, status);
2511     REGEX_CHECK_STATUS;
2512     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2513     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2514     utext_close(result);
2515     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2516     result = matcher->replaceAll(&replText, &destText, status);
2517     REGEX_CHECK_STATUS;
2518     REGEX_ASSERT(result == &destText);
2519     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2520
2521     //
2522     // match whole string: input "abc" is expected to become exactly the replacement "xyz"
2523     //
2524     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2525     utext_openUTF8(&dataText, str_abc, -1, &status);
2526     matcher->reset(&dataText);
2527
2528     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2529     utext_openUTF8(&replText, str_xyz, -1, &status);
2530     result = matcher->replaceFirst(&replText, NULL, status);
2531     REGEX_CHECK_STATUS;
2532     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2533     utext_close(result);
2534     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2535     result = matcher->replaceFirst(&replText, &destText, status);
2536     REGEX_CHECK_STATUS;
2537     REGEX_ASSERT(result == &destText);
2538     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539
2540     result = matcher->replaceAll(&replText, NULL, status);
2541     REGEX_CHECK_STATUS;
2542     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2543     utext_close(result);
2544     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2545     result = matcher->replaceAll(&replText, &destText, status);
2546     REGEX_CHECK_STATUS;
2547     REGEX_ASSERT(result == &destText);
2548     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2549
2550     //
2551     // Capture Group, simple case: a second pattern "a(..)" matched against "abcdefg"
2552     //
2553     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2554     utext_openUTF8(&re, str_add, -1, &status);
2555     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2556     REGEX_CHECK_STATUS;
2557
2558     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2559     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2560     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2561     REGEX_CHECK_STATUS;
2562
2563     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2564     utext_openUTF8(&replText, str_11, -1, &status);
2565     result = matcher2->replaceFirst(&replText, NULL, status);
2566     REGEX_CHECK_STATUS;
2567     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2568     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2569     utext_close(result);
2570     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2571     result = matcher2->replaceFirst(&replText, &destText, status);
2572     REGEX_CHECK_STATUS;
2573     REGEX_ASSERT(result == &destText);
2574     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2575
2576     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2577     utext_openUTF8(&replText, str_v, -1, &status);
2578     REGEX_VERBOSE_TEXT(&replText);
2579     result = matcher2->replaceFirst(&replText, NULL, status);
2580     REGEX_CHECK_STATUS;
2581     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2582     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2583     utext_close(result);
2584     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2585     result = matcher2->replaceFirst(&replText, &destText, status);
2586     REGEX_CHECK_STATUS;
2587     REGEX_ASSERT(result == &destText);
2588     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2589
2590     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2591                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2592                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2593     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2594     result = matcher2->replaceFirst(&replText, NULL, status);
2595     REGEX_CHECK_STATUS;
2596     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2597     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2598     utext_close(result);
2599     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2600     result = matcher2->replaceFirst(&replText, &destText, status);
2601     REGEX_CHECK_STATUS;
2602     REGEX_ASSERT(result == &destText);
2603     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2604
2605     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2606     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2607     //                                 012345678901234567890123456
2608     supplDigitChars[22] = 0xF0;   // bytes 22..25 (the "xxxx" placeholder) become the UTF-8 encoding of U+1D7CF
2609     supplDigitChars[23] = 0x9D;
2610     supplDigitChars[24] = 0x9F;
2611     supplDigitChars[25] = 0x8F;
2612     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2613
2614     result = matcher2->replaceFirst(&replText, NULL, status);
2615     REGEX_CHECK_STATUS;
2616     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2617     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2618     utext_close(result);
2619     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2620     result = matcher2->replaceFirst(&replText, &destText, status);
2621     REGEX_CHECK_STATUS;
2622     REGEX_ASSERT(result == &destText);
2623     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2624     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2625     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2626     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);   // "a(..)" has a single group, so $5 must be rejected
2627 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2628     utext_close(result);
2629     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2630     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2631     REGEX_ASSERT(result == &destText);
2632 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2633
2634     //
2635     // Replacement String with \u hex escapes (uses the first matcher, pattern "abc")
2636     //
2637     {
2638       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2639       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2640         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2641         utext_openUTF8(&replText, str_u0043, -1, &status);
2642         matcher->reset(&dataText);
2643
2644         result = matcher->replaceAll(&replText, NULL, status);
2645         REGEX_CHECK_STATUS;
2646         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2647         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2648         utext_close(result);
2649         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2650         result = matcher->replaceAll(&replText, &destText, status);
2651         REGEX_CHECK_STATUS;
2652         REGEX_ASSERT(result == &destText);
2653         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2654     }
2655     {
2656       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! (shadows the outer str_abc inside this block) */
2657         utext_openUTF8(&dataText, str_abc, -1, &status);
2658         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2659         utext_openUTF8(&replText, str_U00010000, -1, &status);
2660         matcher->reset(&dataText);
2661
2662         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2663         //                          0123456789
2664         expected[2] = 0xF0;   // bytes 2..5 (the "xxxx" placeholder) become the UTF-8 encoding of U+10000
2665         expected[3] = 0x90;
2666         expected[4] = 0x80;
2667         expected[5] = 0x80;
2668
2669         result = matcher->replaceAll(&replText, NULL, status);
2670         REGEX_CHECK_STATUS;
2671         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2672         utext_close(result);
2673         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2674         result = matcher->replaceAll(&replText, &destText, status);
2675         REGEX_CHECK_STATUS;
2676         REGEX_ASSERT(result == &destText);
2677         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2678     }
2679     // TODO:  need more thorough testing of capture substitutions.
2680
2681     // Bug 4057: appendReplacement() after repeated find() calls, or after a reset or
2682     //           find() at an interior position, must still copy the input from its start.
2683     {
2684         status = U_ZERO_ERROR;
2685 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2686 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2687 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2688         utext_openUTF8(&re, str_ssee, -1, &status);
2689         utext_openUTF8(&dataText, str_blah, -1, &status);
2690         utext_openUTF8(&replText, str_ooh, -1, &status);
2691
2692         RegexMatcher m(&re, 0, status);
2693         REGEX_CHECK_STATUS;
2694
2695         UnicodeString result;   // shadows the outer UText *result inside this block
2696         UText resultText = UTEXT_INITIALIZER;
2697         utext_openUnicodeString(&resultText, &result, &status);
2698
2699         // Multiple finds do NOT bump up the previous appendReplacement position.
2700         m.reset(&dataText);
2701         m.find();
2702         m.find();
2703         m.appendReplacement(&resultText, &replText, status);
2704         REGEX_CHECK_STATUS;
2705         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2706         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2707
2708         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2709         status = U_ZERO_ERROR;
2710         result.truncate(0);   // discard the previous output before resultText is reopened on it
2711         utext_openUnicodeString(&resultText, &result, &status);
2712         m.reset(10, status);
2713         m.find();
2714         m.find();
2715         m.appendReplacement(&resultText, &replText, status);
2716         REGEX_CHECK_STATUS;
2717         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2718         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2719
2720         // find() at interior of string, appendReplacement still starts at beginning.
2721         status = U_ZERO_ERROR;
2722         result.truncate(0);
2723         utext_openUnicodeString(&resultText, &result, &status);
2724         m.reset();
2725         m.find(10, status);
2726         m.find();
2727         m.appendReplacement(&resultText, &replText, status);
2728         REGEX_CHECK_STATUS;
2729         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2730         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2731
2732         m.appendTail(&resultText, status);   // expected to add the rest of the input (" fin") after the last match
2733         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2734         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2735
2736         utext_close(&resultText);
2737     }
2738
2739     delete matcher2;   // release the heap matchers and patterns, then close the stack UTexts
2740     delete pat2;
2741     delete matcher;
2742     delete pat;
2743
2744     utext_close(&dataText);
2745     utext_close(&replText);
2746     utext_close(&destText);
2747     utext_close(&re);
2748 }
2749
2750
2751 //---------------------------------------------------------------------------
2752 //
2753 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2754 //                        present and nominally working when patterns
2755 //                        (and, for matches(), the input) are supplied as UText.
2756 //---------------------------------------------------------------------------
2757 void RegexTest::API_Pattern_UTF8() {
2758     RegexPattern        pata;    // Test default constructor to not crash.
2759     RegexPattern        patb;
2760
2761     REGEX_ASSERT(pata == patb);
2762     REGEX_ASSERT(pata == pata);
2763
2764     UText         re1 = UTEXT_INITIALIZER;
2765     UText         re2 = UTEXT_INITIALIZER;
2766     UErrorCode    status = U_ZERO_ERROR;
2767     UParseError   pe;
2768
2769     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2770     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2771     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2772     utext_openUTF8(&re2, str_def, -1, &status);
2773
2774     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2775     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2776     REGEX_CHECK_STATUS;
2777     REGEX_ASSERT(*pat1 == *pat1);
2778     REGEX_ASSERT(*pat1 != pata);
2779
2780     // Assign
2781     patb = *pat1;
2782     REGEX_ASSERT(patb == *pat1);
2783
2784     // Copy Construct
2785     RegexPattern patc(*pat1);
2786     REGEX_ASSERT(patc == *pat1);
2787     REGEX_ASSERT(patb == patc);
2788     REGEX_ASSERT(*pat1 != *pat2);   // compare the pattern objects; comparing the two pointers is always true
2789     patb = *pat2;
2790     REGEX_ASSERT(patb != patc);
2791     REGEX_ASSERT(patb == *pat2);
2792
2793     // Compile with no flags.
2794     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2795     REGEX_CHECK_STATUS;             // check status before pat1a is dereferenced, as after the other compile calls
2796     REGEX_ASSERT(*pat1a == *pat1);
2797     REGEX_ASSERT(pat1a->flags() == 0);
2798
2799     // Compile with different flags should be not equal
2800     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2801     REGEX_CHECK_STATUS;
2802
2803     REGEX_ASSERT(*pat1b != *pat1a);
2804     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2805     REGEX_ASSERT(pat1a->flags() == 0);
2806     delete pat1b;
2807
2808     // clone
2809     RegexPattern *pat1c = pat1->clone();
2810     REGEX_ASSERT(*pat1c == *pat1);
2811     REGEX_ASSERT(*pat1c != *pat2);
2812
2813     delete pat1c;
2814     delete pat1a;
2815     delete pat1;
2816     delete pat2;
2817
2818     utext_close(&re1);
2819     utext_close(&re2);
2820
2821
2822     //
2823     //   Verify that a matcher created from a cloned pattern works.
2824     //     (Jitterbug 3423)
2825     //
2826     {
2827         UErrorCode     status     = U_ZERO_ERROR;
2828         UText          pattern    = UTEXT_INITIALIZER;
2829         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2830         utext_openUTF8(&pattern, str_pL, -1, &status);
2831
2832         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2833         REGEX_CHECK_STATUS;         // check status before pSource is dereferenced
2834         RegexPattern  *pClone     = pSource->clone();
2835         delete         pSource;    // the clone is used after its source pattern is gone
2836         RegexMatcher  *mFromClone = pClone->matcher(status);
2837         REGEX_CHECK_STATUS;
2838         UText          input      = UTEXT_INITIALIZER;
2839         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2840         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2841         mFromClone->reset(&input);
2842         REGEX_ASSERT(mFromClone->find() == TRUE);
2843         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2844         REGEX_ASSERT(mFromClone->find() == TRUE);
2845         REGEX_ASSERT(mFromClone->group(status) == "World");
2846         REGEX_ASSERT(mFromClone->find() == FALSE);
2847         delete mFromClone;
2848         delete pClone;
2849
2850         utext_close(&input);
2851         utext_close(&pattern);
2852     }
2853
2854     //
2855     //   matches convenience API, UText overload: every case passes the UTexts opened just above it
2856     //
2857     {
2858         UErrorCode status  = U_ZERO_ERROR;
2859         UText      pattern = UTEXT_INITIALIZER;
2860         UText      input   = UTEXT_INITIALIZER;
2861
2862         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2863         utext_openUTF8(&input, str_randominput, -1, &status);
2864
2865         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2866         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2867         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2868         REGEX_CHECK_STATUS;
2869
2870         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2871         utext_openUTF8(&pattern, str_abc, -1, &status);
2872         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2873         REGEX_CHECK_STATUS;
2874
2875         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2876         utext_openUTF8(&pattern, str_nput, -1, &status);
2877         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2878         REGEX_CHECK_STATUS;
2879
2880         utext_openUTF8(&pattern, str_randominput, -1, &status);
2881         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2882         REGEX_CHECK_STATUS;
2883
2884         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2885         utext_openUTF8(&pattern, str_u, -1, &status);
2886         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2887         REGEX_CHECK_STATUS;
2888
2889         utext_openUTF8(&input, str_abc, -1, &status);
2890         utext_openUTF8(&pattern, str_abc, -1, &status);
2891         status = U_INDEX_OUTOFBOUNDS_ERROR;   // a failure status on entry: expect FALSE and an unchanged status
2892         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2893         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2894
2895         utext_close(&input);
2896         utext_close(&pattern);
2897     }
2898
2899
2900     //
2901     // Split(): the pattern comes from a UText; inputs and output fields are UnicodeStrings
2902     //
2903     status = U_ZERO_ERROR;
2904     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2905     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2906     pat1 = RegexPattern::compile(&re1, pe, status);
2907     REGEX_CHECK_STATUS;
2908     UnicodeString  fields[10];
2909
2910     int32_t n;
2911     n = pat1->split("Now is the time", fields, 10, status);
2912     REGEX_CHECK_STATUS;
2913     REGEX_ASSERT(n==4);
2914     REGEX_ASSERT(fields[0]=="Now");
2915     REGEX_ASSERT(fields[1]=="is");
2916     REGEX_ASSERT(fields[2]=="the");
2917     REGEX_ASSERT(fields[3]=="time");
2918     REGEX_ASSERT(fields[4]=="");
2919
2920     n = pat1->split("Now is the time", fields, 2, status);   // capacity 2: the last field receives the unsplit remainder
2921     REGEX_CHECK_STATUS;
2922     REGEX_ASSERT(n==2);
2923     REGEX_ASSERT(fields[0]=="Now");
2924     REGEX_ASSERT(fields[1]=="is the time");
2925     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2926
2927     fields[1] = "*";   // sentinel: must survive a split that is limited to one field
2928     status = U_ZERO_ERROR;
2929     n = pat1->split("Now is the time", fields, 1, status);
2930     REGEX_CHECK_STATUS;
2931     REGEX_ASSERT(n==1);
2932     REGEX_ASSERT(fields[0]=="Now is the time");
2933     REGEX_ASSERT(fields[1]=="*");
2934     status = U_ZERO_ERROR;
2935
2936     n = pat1->split("    Now       is the time   ", fields, 10, status);
2937     REGEX_CHECK_STATUS;
2938     REGEX_ASSERT(n==6);
2939     REGEX_ASSERT(fields[0]=="");
2940     REGEX_ASSERT(fields[1]=="Now");
2941     REGEX_ASSERT(fields[2]=="is");
2942     REGEX_ASSERT(fields[3]=="the");
2943     REGEX_ASSERT(fields[4]=="time");
2944     REGEX_ASSERT(fields[5]=="");
2945     REGEX_ASSERT(fields[6]=="");
2946
2947     fields[2] = "*";
2948     n = pat1->split("     ", fields, 10, status);
2949     REGEX_CHECK_STATUS;
2950     REGEX_ASSERT(n==2);
2951     REGEX_ASSERT(fields[0]=="");
2952     REGEX_ASSERT(fields[1]=="");
2953     REGEX_ASSERT(fields[2]=="*");
2954
2955     fields[0] = "foo";
2956     n = pat1->split("", fields, 10, status);   // empty input: expect zero fields and an untouched fields[0]
2957     REGEX_CHECK_STATUS;
2958     REGEX_ASSERT(n==0);
2959     REGEX_ASSERT(fields[0]=="foo");
2960
2961     delete pat1;
2962
2963     //  split, with a pattern with (capture): the captured text is expected among the output fields
2964     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2965     pat1 = RegexPattern::compile(&re1,  pe, status);
2966     REGEX_CHECK_STATUS;
2967
2968     status = U_ZERO_ERROR;
2969     fields[6] = fields[7] = "*";
2970     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2971     REGEX_CHECK_STATUS;
2972     REGEX_ASSERT(n==7);
2973     REGEX_ASSERT(fields[0]=="");
2974     REGEX_ASSERT(fields[1]=="a");
2975     REGEX_ASSERT(fields[2]=="Now is ");
2976     REGEX_ASSERT(fields[3]=="b");
2977     REGEX_ASSERT(fields[4]=="the time");
2978     REGEX_ASSERT(fields[5]=="c");
2979     REGEX_ASSERT(fields[6]=="");
2980     REGEX_ASSERT(fields[7]=="*");
2981     REGEX_ASSERT(status==U_ZERO_ERROR);
2982
2983     fields[6] = fields[7] = "*";
2984     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2985     REGEX_CHECK_STATUS;
2986     REGEX_ASSERT(n==7);
2987     REGEX_ASSERT(fields[0]=="  ");
2988     REGEX_ASSERT(fields[1]=="a");
2989     REGEX_ASSERT(fields[2]=="Now is ");
2990     REGEX_ASSERT(fields[3]=="b");
2991     REGEX_ASSERT(fields[4]=="the time");
2992     REGEX_ASSERT(fields[5]=="c");
2993     REGEX_ASSERT(fields[6]=="");
2994     REGEX_ASSERT(fields[7]=="*");
2995
2996     status = U_ZERO_ERROR;
2997     fields[6] = "foo";
2998     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2999     REGEX_CHECK_STATUS;
3000     REGEX_ASSERT(n==6);
3001     REGEX_ASSERT(fields[0]=="  ");
3002     REGEX_ASSERT(fields[1]=="a");
3003     REGEX_ASSERT(fields[2]=="Now is ");
3004     REGEX_ASSERT(fields[3]=="b");
3005     REGEX_ASSERT(fields[4]=="the time");
3006     REGEX_ASSERT(fields[5]==" ");
3007     REGEX_ASSERT(fields[6]=="foo");
3008
3009     status = U_ZERO_ERROR;
3010     fields[5] = "foo";
3011     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3012     REGEX_CHECK_STATUS;
3013     REGEX_ASSERT(n==5);
3014     REGEX_ASSERT(fields[0]=="  ");
3015     REGEX_ASSERT(fields[1]=="a");
3016     REGEX_ASSERT(fields[2]=="Now is ");
3017     REGEX_ASSERT(fields[3]=="b");
3018     REGEX_ASSERT(fields[4]=="the time<c>");
3019     REGEX_ASSERT(fields[5]=="foo");
3020
3021     status = U_ZERO_ERROR;
3022     fields[5] = "foo";
3023     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3024     REGEX_CHECK_STATUS;
3025     REGEX_ASSERT(n==5);
3026     REGEX_ASSERT(fields[0]=="  ");
3027     REGEX_ASSERT(fields[1]=="a");
3028     REGEX_ASSERT(fields[2]=="Now is ");
3029     REGEX_ASSERT(fields[3]=="b");
3030     REGEX_ASSERT(fields[4]=="the time");
3031     REGEX_ASSERT(fields[5]=="foo");
3032
3033     status = U_ZERO_ERROR;
3034     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3035     REGEX_CHECK_STATUS;
3036     REGEX_ASSERT(n==4);
3037     REGEX_ASSERT(fields[0]=="  ");
3038     REGEX_ASSERT(fields[1]=="a");
3039     REGEX_ASSERT(fields[2]=="Now is ");
3040     REGEX_ASSERT(fields[3]=="the time<c>");
3041     status = U_ZERO_ERROR;
3042     delete pat1;
3043
3044     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3045     pat1 = RegexPattern::compile(&re1, pe, status);
3046     REGEX_CHECK_STATUS;
3047     n = pat1->split("1-10,20", fields, 10, status);
3048     REGEX_CHECK_STATUS;
3049     REGEX_ASSERT(n==5);
3050     REGEX_ASSERT(fields[0]=="1");
3051     REGEX_ASSERT(fields[1]=="-");
3052     REGEX_ASSERT(fields[2]=="10");
3053     REGEX_ASSERT(fields[3]==",");
3054     REGEX_ASSERT(fields[4]=="20");
3055     delete pat1;
3056
3057
3058     //
3059     // split of a UText based string, with library allocating output UTexts.
3060     //
3061     {
3062         status = U_ZERO_ERROR;
3063         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3064         UnicodeString stringToSplit("first:second:third");
3065         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3066         REGEX_CHECK_STATUS;
3067
3068         UText *splits[10] = {NULL};   // all slots start NULL; the test closes whatever split() fills in
3069         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3070         REGEX_CHECK_STATUS;
3071         REGEX_ASSERT(numFields == 5);
3072         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3073         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3074         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3075         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3076         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3077         REGEX_ASSERT(splits[5] == NULL);
3078
3079         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3080             if (splits[i]) {
3081                 utext_close(splits[i]);
3082                 splits[i] = NULL;
3083             }
3084         }
3085         utext_close(textToSplit);
3086     }
3087
3088
3089     //
3090     // RegexPattern::pattern() and patternText()
3091     //
3092     pat1 = new RegexPattern();   // a default-constructed pattern is expected to report empty pattern text
3093     REGEX_ASSERT(pat1->pattern() == "");
3094     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3095     delete pat1;
3096     const char *helloWorldInvariant = "(Hello, world)*";
3097     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3098     pat1 = RegexPattern::compile(&re1, pe, status);
3099     REGEX_CHECK_STATUS;
3100     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3101     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3102     delete pat1;
3103
3104     utext_close(&re1);
3105 }
3106
3107
3108 //---------------------------------------------------------------------------
3109 //
3110 //      Extended       A more thorough check for features of regex patterns
3111 //                     The test cases are in a separate data file,
3112 //                       source/test/testdata/regextst.txt
3113 //                     A description of the test data format is included in that file.
3114 //
3115 //---------------------------------------------------------------------------
3116
3117 const char *
3118 RegexTest::getPath(char buffer[2048], const char *filename) {
3119     UErrorCode status=U_ZERO_ERROR;
3120     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3121     if (U_FAILURE(status)) {
3122         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3123         return NULL;
3124     }
3125
3126     strcpy(buffer, testDataDirectory);
3127     strcat(buffer, filename);
3128     return buffer;
3129 }
3130
3131 void RegexTest::Extended() {
3132     char tdd[2048];
3133     const char *srcPath;
3134     UErrorCode  status  = U_ZERO_ERROR;
3135     int32_t     lineNum = 0;
3136
3137     //
3138     //  Open and read the test data file.
3139     //
3140     srcPath=getPath(tdd, "regextst.txt");
3141     if(srcPath==NULL) {
3142         return; /* something went wrong, error already output */
3143     }
3144
3145     int32_t    len;
3146     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3147     if (U_FAILURE(status)) {
3148         return; /* something went wrong, error already output */
3149     }
3150
3151     //
3152     //  Put the test data into a UnicodeString
3153     //
3154     UnicodeString testString(FALSE, testData, len);
3155
3156     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3157     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3158     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3159
3160     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3161     UnicodeString   testPattern;   // The pattern for test from the test file.
3162     UnicodeString   testFlags;     // the flags   for a test.
3163     UnicodeString   matchString;   // The marked up string to be used as input
3164
3165     if (U_FAILURE(status)){
3166         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3167         delete [] testData;
3168         return;
3169     }
3170
3171     //
3172     //  Loop over the test data file, once per line.
3173     //
3174     while (lineMat.find()) {
3175         lineNum++;
3176         if (U_FAILURE(status)) {
3177           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3178         }
3179
3180         status = U_ZERO_ERROR;
3181         UnicodeString testLine = lineMat.group(1, status);
3182         if (testLine.length() == 0) {
3183             continue;
3184         }
3185
3186         //
3187         // Parse the test line.  Skip blank and comment only lines.
3188         // Separate out the three main fields - pattern, flags, target.
3189         //
3190
3191         commentMat.reset(testLine);
3192         if (commentMat.lookingAt(status)) {
3193             // This line is a comment, or blank.
3194             continue;
3195         }
3196
3197         //
3198         //  Pull out the pattern field, remove it from the test file line.
3199         //
3200         quotedStuffMat.reset(testLine);
3201         if (quotedStuffMat.lookingAt(status)) {
3202             testPattern = quotedStuffMat.group(2, status);
3203             testLine.remove(0, quotedStuffMat.end(0, status));
3204         } else {
3205             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3206             continue;
3207         }
3208
3209
3210         //
3211         //  Pull out the flags from the test file line.
3212         //
3213         flagsMat.reset(testLine);
3214         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3215         testFlags = flagsMat.group(1, status);
3216         if (flagsMat.group(2, status).length() > 0) {
3217             errln("Bad Match flag at line %d. Scanning %c\n",
3218                 lineNum, flagsMat.group(2, status).charAt(0));
3219             continue;
3220         }
3221         testLine.remove(0, flagsMat.end(0, status));
3222
3223         //
3224         //  Pull out the match string, as a whole.
3225         //    We'll process the <tags> later.
3226         //
3227         quotedStuffMat.reset(testLine);
3228         if (quotedStuffMat.lookingAt(status)) {
3229             matchString = quotedStuffMat.group(2, status);
3230             testLine.remove(0, quotedStuffMat.end(0, status));
3231         } else {
3232             errln("Bad match string at test file line %d", lineNum);
3233             continue;
3234         }
3235
3236         //
3237         //  The only thing left from the input line should be an optional trailing comment.
3238         //
3239         commentMat.reset(testLine);
3240         if (commentMat.lookingAt(status) == FALSE) {
3241             errln("Line %d: unexpected characters at end of test line.", lineNum);
3242             continue;
3243         }
3244
3245         //
3246         //  Run the test
3247         //
3248         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3249     }
3250
3251     delete [] testData;
3252
3253 }
3254
3255
3256
3257 //---------------------------------------------------------------------------
3258 //
3259 //    regex_find(pattern, flags, inputString, lineNumber)
3260 //
3261 //         Function to run a single test from the Extended (data driven) tests.
3262 //         See file test/testdata/regextst.txt for a description of the
3263 //         pattern and inputString fields, and the allowed flags.
3264 //         lineNumber is the source line in regextst.txt of the test.
3265 //
3266 //---------------------------------------------------------------------------
3267
3268
3269 //  Set a value into a UVector at position specified by a decimal number in
3270 //   a UnicodeString.   This is a utility function needed by the actual test function,
3271 //   which follows.
3272 static void set(UVector &vec, int32_t val, UnicodeString index) {
3273     UErrorCode  status=U_ZERO_ERROR;
3274     int32_t  idx = 0;
3275     for (int32_t i=0; i<index.length(); i++) {
3276         int32_t d=u_charDigitValue(index.charAt(i));
3277         if (d<0) {return;}
3278         idx = idx*10 + d;
3279     }
3280     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3281     vec.setElementAt(val, idx);
3282 }
3283
3284 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3285     UErrorCode  status=U_ZERO_ERROR;
3286     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3287     vec.setElementAt(val, idx);
3288 }
3289
3290 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3291 {
3292     UBool couldFind = TRUE;
3293     UTEXT_SETNATIVEINDEX(utext, 0);
3294     int32_t i = 0;
3295     while (i < unistrOffset) {
3296         UChar32 c = UTEXT_NEXT32(utext);
3297         if (c != U_SENTINEL) {
3298             i += U16_LENGTH(c);
3299         } else {
3300             couldFind = FALSE;
3301             break;
3302         }
3303     }
3304     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3305     return couldFind;
3306 }
3307
3308
3309 void RegexTest::regex_find(const UnicodeString &pattern,
3310                            const UnicodeString &flags,
3311                            const UnicodeString &inputString,
3312                            const char *srcPath,
3313                            int32_t line) {
3314     UnicodeString       unEscapedInput;
3315     UnicodeString       deTaggedInput;
3316
3317     int32_t             patternUTF8Length,      inputUTF8Length;
3318     char                *patternChars  = NULL, *inputChars = NULL;
3319     UText               patternText    = UTEXT_INITIALIZER;
3320     UText               inputText      = UTEXT_INITIALIZER;
3321     UConverter          *UTF8Converter = NULL;
3322
3323     UErrorCode          status         = U_ZERO_ERROR;
3324     UParseError         pe;
3325     RegexPattern        *parsePat      = NULL;
3326     RegexMatcher        *parseMatcher  = NULL;
3327     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3328     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3329     UVector             groupStarts(status);
3330     UVector             groupEnds(status);
3331     UVector             groupStartsUTF8(status);
3332     UVector             groupEndsUTF8(status);
3333     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3334     UBool               failed         = FALSE;
3335     int32_t             numFinds;
3336     int32_t             i;
3337     UBool               useMatchesFunc   = FALSE;
3338     UBool               useLookingAtFunc = FALSE;
3339     int32_t             regionStart      = -1;
3340     int32_t             regionEnd        = -1;
3341     int32_t             regionStartUTF8  = -1;
3342     int32_t             regionEndUTF8    = -1;
3343
3344
3345     //
3346     //  Compile the caller's pattern
3347     //
3348     uint32_t bflags = 0;
3349     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3350         bflags |= UREGEX_CASE_INSENSITIVE;
3351     }
3352     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3353         bflags |= UREGEX_COMMENTS;
3354     }
3355     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3356         bflags |= UREGEX_DOTALL;
3357     }
3358     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3359         bflags |= UREGEX_MULTILINE;
3360     }
3361
3362     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3363         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3364     }
3365     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3366         bflags |= UREGEX_UNIX_LINES;
3367     }
3368     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3369         bflags |= UREGEX_LITERAL;
3370     }
3371
3372
3373     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3374     if (status != U_ZERO_ERROR) {
3375         #if UCONFIG_NO_BREAK_ITERATION==1
3376         // 'v' test flag means that the test pattern should not compile if ICU was configured
3377         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3378         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3379             goto cleanupAndReturn;
3380         }
3381         #endif
3382         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3383             // Expected pattern compilation error.
3384             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3385                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3386             }
3387             goto cleanupAndReturn;
3388         } else {
3389             // Unexpected pattern compilation error.
3390             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3391             goto cleanupAndReturn;
3392         }
3393     }
3394
3395     UTF8Converter = ucnv_open("UTF8", &status);
3396     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3397
3398     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3399     status = U_ZERO_ERROR; // buffer overflow
3400     patternChars = new char[patternUTF8Length+1];
3401     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3402     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3403
3404     if (status == U_ZERO_ERROR) {
3405         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3406
3407         if (status != U_ZERO_ERROR) {
3408 #if UCONFIG_NO_BREAK_ITERATION==1
3409             // 'v' test flag means that the test pattern should not compile if ICU was configured
3410             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3411             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3412                 goto cleanupAndReturn;
3413             }
3414 #endif
3415             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3416                 // Expected pattern compilation error.
3417                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3418                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3419                 }
3420                 goto cleanupAndReturn;
3421             } else {
3422                 // Unexpected pattern compilation error.
3423                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3424                 goto cleanupAndReturn;
3425             }
3426         }
3427     }
3428
3429     if (UTF8Pattern == NULL) {
3430         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3431         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3432         status = U_ZERO_ERROR;
3433     }
3434
3435     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3436         callerPattern->dumpPattern();
3437     }
3438
3439     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3440         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3441         goto cleanupAndReturn;
3442     }
3443
3444
3445     //
3446     // Number of times find() should be called on the test string, default to 1
3447     //
3448     numFinds = 1;
3449     for (i=2; i<=9; i++) {
3450         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3451             if (numFinds != 1) {
3452                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3453                 goto cleanupAndReturn;
3454             }
3455             numFinds = i;
3456         }
3457     }
3458
3459     // 'M' flag.  Use matches() instead of find()
3460     if (flags.indexOf((UChar)0x4d) >= 0) {
3461         useMatchesFunc = TRUE;
3462     }
3463     if (flags.indexOf((UChar)0x4c) >= 0) {
3464         useLookingAtFunc = TRUE;
3465     }
3466
3467     //
3468     //  Find the tags in the input data, remove them, and record the group boundary
3469     //    positions.
3470     //
3471     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3472     REGEX_CHECK_STATUS_L(line);
3473
3474     unEscapedInput = inputString.unescape();
3475     parseMatcher = parsePat->matcher(unEscapedInput, status);
3476     REGEX_CHECK_STATUS_L(line);
3477     while(parseMatcher->find()) {
3478         parseMatcher->appendReplacement(deTaggedInput, "", status);
3479         REGEX_CHECK_STATUS;
3480         UnicodeString groupNum = parseMatcher->group(2, status);
3481         if (groupNum == "r") {
3482             // <r> or </r>, a region specification within the string
3483             if (parseMatcher->group(1, status) == "/") {
3484                 regionEnd = deTaggedInput.length();
3485             } else {
3486                 regionStart = deTaggedInput.length();
3487             }
3488         } else {
3489             // <digits> or </digits>, a group match boundary tag.
3490             if (parseMatcher->group(1, status) == "/") {
3491                 set(groupEnds, deTaggedInput.length(), groupNum);
3492             } else {
3493                 set(groupStarts, deTaggedInput.length(), groupNum);
3494             }
3495         }
3496     }
3497     parseMatcher->appendTail(deTaggedInput);
3498     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3499     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3500       errln("mismatched <r> tags");
3501       failed = TRUE;
3502       goto cleanupAndReturn;
3503     }
3504
3505     //
3506     //  Configure the matcher according to the flags specified with this test.
3507     //
3508     matcher = callerPattern->matcher(deTaggedInput, status);
3509     REGEX_CHECK_STATUS_L(line);
3510     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3511         matcher->setTrace(TRUE);
3512     }
3513
3514     if (UTF8Pattern != NULL) {
3515         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3516         status = U_ZERO_ERROR; // buffer overflow
3517         inputChars = new char[inputUTF8Length+1];
3518         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3519         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3520
3521         if (status == U_ZERO_ERROR) {
3522             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3523             REGEX_CHECK_STATUS_L(line);
3524         }
3525
3526         if (UTF8Matcher == NULL) {
3527             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3528             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3529             status = U_ZERO_ERROR;
3530         }
3531     }
3532
3533     //
3534     //  Generate native indices for UTF8 versions of region and capture group info
3535     //
3536     if (UTF8Matcher != NULL) {
3537         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3538             UTF8Matcher->setTrace(TRUE);
3539         }
3540         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3541         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3542
3543         //  Fill out the native index UVector info.
3544         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3545         for (i=0; i<groupStarts.size(); i++) {
3546             int32_t  start = groupStarts.elementAti(i);
3547             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3548             if (start >= 0) {
3549                 int32_t  startUTF8;
3550                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3551                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3552                     failed = TRUE;
3553                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3554                 }
3555                 setInt(groupStartsUTF8, startUTF8, i);
3556             }
3557
3558             int32_t  end = groupEnds.elementAti(i);
3559             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3560             if (end >= 0) {
3561                 int32_t  endUTF8;
3562                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3563                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3564                     failed = TRUE;
3565                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3566                 }
3567                 setInt(groupEndsUTF8, endUTF8, i);
3568             }
3569         }
3570     }
3571
3572     if (regionStart>=0) {
3573        matcher->region(regionStart, regionEnd, status);
3574        REGEX_CHECK_STATUS_L(line);
3575        if (UTF8Matcher != NULL) {
3576            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3577            REGEX_CHECK_STATUS_L(line);
3578        }
3579     }
3580     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3581         matcher->useAnchoringBounds(FALSE);
3582         if (UTF8Matcher != NULL) {
3583             UTF8Matcher->useAnchoringBounds(FALSE);
3584         }
3585     }
3586     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3587         matcher->useTransparentBounds(TRUE);
3588         if (UTF8Matcher != NULL) {
3589             UTF8Matcher->useTransparentBounds(TRUE);
3590         }
3591     }
3592
3593
3594
3595     //
3596     // Do a find on the de-tagged input using the caller's pattern
3597     //     TODO: error on count>1 and not find().
3598     //           error on both matches() and lookingAt().
3599     //
3600     for (i=0; i<numFinds; i++) {
3601         if (useMatchesFunc) {
3602             isMatch = matcher->matches(status);
3603             if (UTF8Matcher != NULL) {
3604                isUTF8Match = UTF8Matcher->matches(status);
3605             }
3606         } else  if (useLookingAtFunc) {
3607             isMatch = matcher->lookingAt(status);
3608             if (UTF8Matcher != NULL) {
3609                 isUTF8Match = UTF8Matcher->lookingAt(status);
3610             }
3611         } else {
3612             isMatch = matcher->find();
3613             if (UTF8Matcher != NULL) {
3614                 isUTF8Match = UTF8Matcher->find();
3615             }
3616         }
3617     }
3618     matcher->setTrace(FALSE);
3619     if (UTF8Matcher) {
3620         UTF8Matcher->setTrace(FALSE);
3621     }
3622     if (U_FAILURE(status)) {
3623         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3624     }
3625
3626     //
3627     // Match up the groups from the find() with the groups from the tags
3628     //
3629
3630     // number of tags should match number of groups from find operation.
3631     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3632     //   G option in test means that capture group data is not available in the
3633     //     expected results, so the check needs to be suppressed.
3634     if (isMatch == FALSE && groupStarts.size() != 0) {
3635         dataerrln("Error at line %d:  Match expected, but none found.", line);
3636         failed = TRUE;
3637         goto cleanupAndReturn;
3638     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3639         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3640         failed = TRUE;
3641         goto cleanupAndReturn;
3642     }
3643     if (isMatch && groupStarts.size() == 0) {
3644         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3645         failed = TRUE;
3646     }
3647     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3648         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3649         failed = TRUE;
3650     }
3651
3652     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3653         // Only check for match / no match.  Don't check capture groups.
3654         goto cleanupAndReturn;
3655     }
3656
3657     REGEX_CHECK_STATUS_L(line);
3658     for (i=0; i<=matcher->groupCount(); i++) {
3659         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3660         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3661         if (matcher->start(i, status) != expectedStart) {
3662             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3663                 line, i, expectedStart, matcher->start(i, status));
3664             failed = TRUE;
3665             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3666         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3667             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3668                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3669             failed = TRUE;
3670             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3671         }
3672
3673         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3674         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3675         if (matcher->end(i, status) != expectedEnd) {
3676             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3677                 line, i, expectedEnd, matcher->end(i, status));
3678             failed = TRUE;
3679             // Error on end position;  keep going; real error is probably yet to come as group
3680             //   end positions work from end of the input data towards the front.
3681         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3682             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3683                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3684             failed = TRUE;
3685             // Error on end position;  keep going; real error is probably yet to come as group
3686             //   end positions work from end of the input data towards the front.
3687         }
3688     }
3689     if ( matcher->groupCount()+1 < groupStarts.size()) {
3690         errln("Error at line %d: Expected %d capture groups, found %d.",
3691             line, groupStarts.size()-1, matcher->groupCount());
3692         failed = TRUE;
3693         }
3694     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3695         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3696               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3697         failed = TRUE;
3698     }
3699
3700     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3701         matcher->requireEnd() == TRUE) {
3702         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3703         failed = TRUE;
3704     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3705         UTF8Matcher->requireEnd() == TRUE) {
3706         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3707         failed = TRUE;
3708     }
3709
3710     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3711         matcher->requireEnd() == FALSE) {
3712         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3713         failed = TRUE;
3714     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3715         UTF8Matcher->requireEnd() == FALSE) {
3716         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3717         failed = TRUE;
3718     }
3719
3720     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3721         matcher->hitEnd() == TRUE) {
3722         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3723         failed = TRUE;
3724     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3725                UTF8Matcher->hitEnd() == TRUE) {
3726         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3727         failed = TRUE;
3728     }
3729
3730     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3731         matcher->hitEnd() == FALSE) {
3732         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3733         failed = TRUE;
3734     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3735                UTF8Matcher->hitEnd() == FALSE) {
3736         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3737         failed = TRUE;
3738     }
3739
3740
3741 cleanupAndReturn:
3742     if (failed) {
3743         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3744             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3745         // callerPattern->dump();
3746     }
3747     delete parseMatcher;
3748     delete parsePat;
3749     delete UTF8Matcher;
3750     delete UTF8Pattern;
3751     delete matcher;
3752     delete callerPattern;
3753
3754     utext_close(&inputText);
3755     delete[] inputChars;
3756     utext_close(&patternText);
3757     delete[] patternChars;
3758     ucnv_close(UTF8Converter);
3759 }
3760
3761
3762
3763
3764 //---------------------------------------------------------------------------
3765 //
3766 //      Errors     Check for error handling in patterns.
3767 //
3768 //---------------------------------------------------------------------------
3769 void RegexTest::Errors() {
3770     // \escape sequences that aren't implemented yet.
3771     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3772
3773     // Missing close parentheses
3774     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3775     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3776     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3777
3778     // Extra close paren
3779     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3780     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3781     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3782
3783     // Look-ahead, Look-behind
3784     //  TODO:  add tests for unbounded length look-behinds.
3785     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3786
3787     // Attempt to use non-default flags
3788     {
3789         UParseError   pe;
3790         UErrorCode    status = U_ZERO_ERROR;
3791         int32_t       flags  = UREGEX_CANON_EQ |
3792                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3793                                UREGEX_MULTILINE;
3794         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3795         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3796         delete pat1;
3797     }
3798
3799
3800     // Quantifiers are allowed only after something that can be quantified.
3801     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3802     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3803     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3804
3805     // Mal-formed {min,max} quantifiers
3806     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3807     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3808     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3809     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3810     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3811     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3812     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3813     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3814     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3815
3816     // Ticket 5389
3817     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3818
3819     // Invalid Back Reference \0
3820     //    For ICU 3.8 and earlier
3821     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3822     //
3823     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3824
3825 }
3826
3827
3828 //-------------------------------------------------------------------------------
3829 //
3830 //  Read a text data file, convert it to UChars, and return the data
3831 //    in one big UChar * buffer, which the caller must delete.
3832 //
3833 //--------------------------------------------------------------------------------
3834 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3835                                      const char *defEncoding, UErrorCode &status) {
3836     UChar       *retPtr  = NULL;
3837     char        *fileBuf = NULL;
3838     UConverter* conv     = NULL;
3839     FILE        *f       = NULL;
3840
3841     ulen = 0;
3842     if (U_FAILURE(status)) {
3843         return retPtr;
3844     }
3845
3846     //
3847     //  Open the file.
3848     //
3849     f = fopen(fileName, "rb");
3850     if (f == 0) {
3851         dataerrln("Error opening test data file %s\n", fileName);
3852         status = U_FILE_ACCESS_ERROR;
3853         return NULL;
3854     }
3855     //
3856     //  Read it in
3857     //
3858     int32_t            fileSize;
3859     int32_t            amt_read;
3860
3861     fseek( f, 0, SEEK_END);
3862     fileSize = ftell(f);
3863     fileBuf = new char[fileSize];
3864     fseek(f, 0, SEEK_SET);
3865     amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
3866     if (amt_read != fileSize || fileSize <= 0) {
3867         errln("Error reading test data file.");
3868         goto cleanUpAndReturn;
3869     }
3870
3871     //
3872     // Look for a Unicode Signature (BOM) on the data just read
3873     //
3874     int32_t        signatureLength;
3875     const char *   fileBufC;
3876     const char*    encoding;
3877
3878     fileBufC = fileBuf;
3879     encoding = ucnv_detectUnicodeSignature(
3880         fileBuf, fileSize, &signatureLength, &status);
3881     if(encoding!=NULL ){
3882         fileBufC  += signatureLength;
3883         fileSize  -= signatureLength;
3884     } else {
3885         encoding = defEncoding;
3886         if (strcmp(encoding, "utf-8") == 0) {
3887             errln("file %s is missing its BOM", fileName);
3888         }
3889     }
3890
3891     //
3892     // Open a converter to take the rule file to UTF-16
3893     //
3894     conv = ucnv_open(encoding, &status);
3895     if (U_FAILURE(status)) {
3896         goto cleanUpAndReturn;
3897     }
3898
3899     //
3900     // Convert the rules to UChar.
3901     //  Preflight first to determine required buffer size.
3902     //
3903     ulen = ucnv_toUChars(conv,
3904         NULL,           //  dest,
3905         0,              //  destCapacity,
3906         fileBufC,
3907         fileSize,
3908         &status);
3909     if (status == U_BUFFER_OVERFLOW_ERROR) {
3910         // Buffer Overflow is expected from the preflight operation.
3911         status = U_ZERO_ERROR;
3912
3913         retPtr = new UChar[ulen+1];
3914         ucnv_toUChars(conv,
3915             retPtr,       //  dest,
3916             ulen+1,
3917             fileBufC,
3918             fileSize,
3919             &status);
3920     }
3921
3922 cleanUpAndReturn:
3923     fclose(f);
3924     delete[] fileBuf;
3925     ucnv_close(conv);
3926     if (U_FAILURE(status)) {
3927         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3928         delete []retPtr;
3929         retPtr = 0;
3930         ulen   = 0;
3931     };
3932     return retPtr;
3933 }
3934
3935
3936 //-------------------------------------------------------------------------------
3937 //
3938 //   PerlTests  - Run Perl's regular expression tests
3939 //                The input file for this test is re_tests, the standard regular
3940 //                expression test data distributed with the Perl source code.
3941 //
3942 //                Here is Perl's description of the test data file:
3943 //
3944 //        # The tests are in a separate file 't/op/re_tests'.
3945 //        # Each line in that file is a separate test.
3946 //        # There are five columns, separated by tabs.
3947 //        #
3948 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3949 //        # Modifiers can be put after the closing C<'>.
3950 //        #
3951 //        # Column 2 contains the string to be matched.
3952 //        #
3953 //        # Column 3 contains the expected result:
3954 //        #     y   expect a match
3955 //        #     n   expect no match
3956 //        #     c   expect an error
3957 //        # B   test exposes a known bug in Perl, should be skipped
3958 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3959 //        #
3960 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3961 //        #
3962 //        # Column 4 contains a string, usually C<$&>.
3963 //        #
3964 //        # Column 5 contains the expected result of double-quote
3965 //        # interpolating that string after the match, or start of error message.
3966 //        #
3967 //        # Column 6, if present, contains a reason why the test is skipped.
3968 //        # This is printed with "skipped", for harness to pick up.
3969 //        #
3970 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3971 //        #
3972 //        # If you want to add a regular expression test that can't be expressed
3973 //        # in this format, don't add it here: put it in op/pat.t instead.
3974 //
3975 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
3977 //        (The i is in addition to whatever was there before.)
3978 //
3979 //-------------------------------------------------------------------------------
3980 void RegexTest::PerlTests() {
3981     char tdd[2048];
3982     const char *srcPath;
3983     UErrorCode  status = U_ZERO_ERROR;
3984     UParseError pe;
3985
3986     //
3987     //  Open and read the test data file.
3988     //
3989     srcPath=getPath(tdd, "re_tests.txt");
3990     if(srcPath==NULL) {
3991         return; /* something went wrong, error already output */
3992     }
3993
3994     int32_t    len;
3995     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3996     if (U_FAILURE(status)) {
3997         return; /* something went wrong, error already output */
3998     }
3999
4000     //
4001     //  Put the test data into a UnicodeString
4002     //
4003     UnicodeString testDataString(FALSE, testData, len);
4004
4005     //
4006     //  Regex to break the input file into lines, and strip the new lines.
4007     //     One line per match, capture group one is the desired data.
4008     //
4009     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4010     if (U_FAILURE(status)) {
4011         dataerrln("RegexPattern::compile() error");
4012         return;
4013     }
4014     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4015
4016     //
4017     //  Regex to split a test file line into fields.
4018     //    There are six fields, separated by tabs.
4019     //
4020     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4021
4022     //
4023     //  Regex to identify test patterns with flag settings, and to separate them.
4024     //    Test patterns with flags look like 'pattern'i
4025     //    Test patterns without flags are not quoted:   pattern
4026     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4027     //
4028     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4029     RegexMatcher* flagMat = flagPat->matcher(status);
4030
4031     //
4032     // The Perl tests reference several perl-isms, which are evaluated/substituted
4033     //   in the test data.  Not being perl, this must be done explicitly.  Here
4034     //   are string constants and REs for these constructs.
4035     //
4036     UnicodeString nulnulSrc("${nulnul}");
4037     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4038     nulnul = nulnul.unescape();
4039
4040     UnicodeString ffffSrc("${ffff}");
4041     UnicodeString ffff("\\uffff", -1, US_INV);
4042     ffff = ffff.unescape();
4043
4044     //  regexp for $-[0], $+[2], etc.
4045     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4046     RegexMatcher *groupsMat = groupsPat->matcher(status);
4047
4048     //  regexp for $0, $1, $2, etc.
4049     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4050     RegexMatcher *cgMat = cgPat->matcher(status);
4051
4052
4053     //
4054     // Main Loop for the Perl Tests, runs once per line from the
4055     //   test data file.
4056     //
4057     int32_t  lineNum = 0;
4058     int32_t  skippedUnimplementedCount = 0;
4059     while (lineMat->find()) {
4060         lineNum++;
4061
4062         //
4063         //  Get a line, break it into its fields, do the Perl
4064         //    variable substitutions.
4065         //
4066         UnicodeString line = lineMat->group(1, status);
4067         UnicodeString fields[7];
4068         fieldPat->split(line, fields, 7, status);
4069
4070         flagMat->reset(fields[0]);
4071         flagMat->matches(status);
4072         UnicodeString pattern  = flagMat->group(2, status);
4073         pattern.findAndReplace("${bang}", "!");
4074         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4075         pattern.findAndReplace(ffffSrc, ffff);
4076
4077         //
4078         //  Identify patterns that include match flag settings,
4079         //    split off the flags, remove the extra quotes.
4080         //
4081         UnicodeString flagStr = flagMat->group(3, status);
4082         if (U_FAILURE(status)) {
4083             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4084             return;
4085         }
4086         int32_t flags = 0;
4087         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4088         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4089         const UChar UChar_m = 0x6d;
4090         const UChar UChar_x = 0x78;
4091         const UChar UChar_y = 0x79;
4092         if (flagStr.indexOf(UChar_i) != -1) {
4093             flags |= UREGEX_CASE_INSENSITIVE;
4094         }
4095         if (flagStr.indexOf(UChar_m) != -1) {
4096             flags |= UREGEX_MULTILINE;
4097         }
4098         if (flagStr.indexOf(UChar_x) != -1) {
4099             flags |= UREGEX_COMMENTS;
4100         }
4101
4102         //
4103         // Compile the test pattern.
4104         //
4105         status = U_ZERO_ERROR;
4106         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4107         if (status == U_REGEX_UNIMPLEMENTED) {
4108             //
4109             // Test of a feature that is planned for ICU, but not yet implemented.
4110             //   skip the test.
4111             skippedUnimplementedCount++;
4112             delete testPat;
4113             status = U_ZERO_ERROR;
4114             continue;
4115         }
4116
4117         if (U_FAILURE(status)) {
4118             // Some tests are supposed to generate errors.
4119             //   Only report an error for tests that are supposed to succeed.
4120             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4121                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4122             {
4123                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4124             }
4125             status = U_ZERO_ERROR;
4126             delete testPat;
4127             continue;
4128         }
4129
4130         if (fields[2].indexOf(UChar_i) >= 0) {
4131             // ICU should skip this test.
4132             delete testPat;
4133             continue;
4134         }
4135
4136         if (fields[2].indexOf(UChar_c) >= 0) {
4137             // This pattern should have caused a compilation error, but didn't/
4138             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4139             delete testPat;
4140             continue;
4141         }
4142
4143         //
4144         // replace the Perl variables that appear in some of the
4145         //   match data strings.
4146         //
4147         UnicodeString matchString = fields[1];
4148         matchString.findAndReplace(nulnulSrc, nulnul);
4149         matchString.findAndReplace(ffffSrc,   ffff);
4150
4151         // Replace any \n in the match string with an actual new-line char.
4152         //  Don't do full unescape, as this unescapes more than Perl does, which
4153         //  causes other spurious failures in the tests.
4154         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4155
4156
4157
4158         //
4159         // Run the test, check for expected match/don't match result.
4160         //
4161         RegexMatcher *testMat = testPat->matcher(matchString, status);
4162         UBool found = testMat->find();
4163         UBool expected = FALSE;
4164         if (fields[2].indexOf(UChar_y) >=0) {
4165             expected = TRUE;
4166         }
4167         if (expected != found) {
4168             errln("line %d: Expected %smatch, got %smatch",
4169                 lineNum, expected?"":"no ", found?"":"no " );
4170             continue;
4171         }
4172
4173         // Don't try to check expected results if there is no match.
4174         //   (Some have stuff in the expected fields)
4175         if (!found) {
4176             delete testMat;
4177             delete testPat;
4178             continue;
4179         }
4180
4181         //
4182         // Interpret the Perl expression from the fourth field of the data file,
4183         // building up an ICU string from the results of the ICU match.
4184         //   The Perl expression will contain references to the results of
4185         //     a regex match, including the matched string, capture group strings,
4186         //     group starting and ending indicies, etc.
4187         //
4188         UnicodeString resultString;
4189         UnicodeString perlExpr = fields[3];
4190 #if SUPPORT_MUTATING_INPUT_STRING
4191         groupsMat->reset(perlExpr);
4192         cgMat->reset(perlExpr);
4193 #endif
4194
4195         while (perlExpr.length() > 0) {
4196 #if !SUPPORT_MUTATING_INPUT_STRING
4197             //  Perferred usage.  Reset after any modification to input string.
4198             groupsMat->reset(perlExpr);
4199             cgMat->reset(perlExpr);
4200 #endif
4201
4202             if (perlExpr.startsWith("$&")) {
4203                 resultString.append(testMat->group(status));
4204                 perlExpr.remove(0, 2);
4205             }
4206
4207             else if (groupsMat->lookingAt(status)) {
4208                 // $-[0]   $+[2]  etc.
4209                 UnicodeString digitString = groupsMat->group(2, status);
4210                 int32_t t = 0;
4211                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4212                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4213                 int32_t matchPosition;
4214                 if (plusOrMinus.compare("+") == 0) {
4215                     matchPosition = testMat->end(groupNum, status);
4216                 } else {
4217                     matchPosition = testMat->start(groupNum, status);
4218                 }
4219                 if (matchPosition != -1) {
4220                     ICU_Utility::appendNumber(resultString, matchPosition);
4221                 }
4222                 perlExpr.remove(0, groupsMat->end(status));
4223             }
4224
4225             else if (cgMat->lookingAt(status)) {
4226                 // $1, $2, $3, etc.
4227                 UnicodeString digitString = cgMat->group(1, status);
4228                 int32_t t = 0;
4229                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4230                 if (U_SUCCESS(status)) {
4231                     resultString.append(testMat->group(groupNum, status));
4232                     status = U_ZERO_ERROR;
4233                 }
4234                 perlExpr.remove(0, cgMat->end(status));
4235             }
4236
4237             else if (perlExpr.startsWith("@-")) {
4238                 int32_t i;
4239                 for (i=0; i<=testMat->groupCount(); i++) {
4240                     if (i>0) {
4241                         resultString.append(" ");
4242                     }
4243                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4244                 }
4245                 perlExpr.remove(0, 2);
4246             }
4247
4248             else if (perlExpr.startsWith("@+")) {
4249                 int32_t i;
4250                 for (i=0; i<=testMat->groupCount(); i++) {
4251                     if (i>0) {
4252                         resultString.append(" ");
4253                     }
4254                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4255                 }
4256                 perlExpr.remove(0, 2);
4257             }
4258
4259             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4260                                                      //           or as an escaped sequence (e.g. \n)
4261                 if (perlExpr.length() > 1) {
4262                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4263                 }
4264                 UChar c = perlExpr.charAt(0);
4265                 switch (c) {
4266                 case 'n':   c = '\n'; break;
4267                 // add any other escape sequences that show up in the test expected results.
4268                 }
4269                 resultString.append(c);
4270                 perlExpr.remove(0, 1);
4271             }
4272
4273             else  {
4274                 // Any characters from the perl expression that we don't explicitly
4275                 //  recognize before here are assumed to be literals and copied
4276                 //  as-is to the expected results.
4277                 resultString.append(perlExpr.charAt(0));
4278                 perlExpr.remove(0, 1);
4279             }
4280
4281             if (U_FAILURE(status)) {
4282                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4283                 break;
4284             }
4285         }
4286
4287         //
4288         // Expected Results Compare
4289         //
4290         UnicodeString expectedS(fields[4]);
4291         expectedS.findAndReplace(nulnulSrc, nulnul);
4292         expectedS.findAndReplace(ffffSrc,   ffff);
4293         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4294
4295
4296         if (expectedS.compare(resultString) != 0) {
4297             err("Line %d: Incorrect perl expression results.", lineNum);
4298             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4299         }
4300
4301         delete testMat;
4302         delete testPat;
4303     }
4304
4305     //
4306     // All done.  Clean up allocated stuff.
4307     //
4308     delete cgMat;
4309     delete cgPat;
4310
4311     delete groupsMat;
4312     delete groupsPat;
4313
4314     delete flagMat;
4315     delete flagPat;
4316
4317     delete lineMat;
4318     delete linePat;
4319
4320     delete fieldPat;
4321     delete [] testData;
4322
4323
4324     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4325
4326 }
4327
4328
4329 //-------------------------------------------------------------------------------
4330 //
4331 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4332 //                  (instead of using UnicodeStrings) to test the alternate engine.
4333 //                  The input file for this test is re_tests, the standard regular
4334 //                  expression test data distributed with the Perl source code.
4335 //                  See PerlTests() for more information.
4336 //
4337 //-------------------------------------------------------------------------------
4338 void RegexTest::PerlTestsUTF8() {
4339     char tdd[2048];
4340     const char *srcPath;
4341     UErrorCode  status = U_ZERO_ERROR;
4342     UParseError pe;
4343     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4344     UText       patternText = UTEXT_INITIALIZER;
4345     char       *patternChars = NULL;
4346     int32_t     patternLength;
4347     int32_t     patternCapacity = 0;
4348     UText       inputText = UTEXT_INITIALIZER;
4349     char       *inputChars = NULL;
4350     int32_t     inputLength;
4351     int32_t     inputCapacity = 0;
4352
4353     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4354
4355     //
4356     //  Open and read the test data file.
4357     //
4358     srcPath=getPath(tdd, "re_tests.txt");
4359     if(srcPath==NULL) {
4360         return; /* something went wrong, error already output */
4361     }
4362
4363     int32_t    len;
4364     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4365     if (U_FAILURE(status)) {
4366         return; /* something went wrong, error already output */
4367     }
4368
4369     //
4370     //  Put the test data into a UnicodeString
4371     //
4372     UnicodeString testDataString(FALSE, testData, len);
4373
4374     //
4375     //  Regex to break the input file into lines, and strip the new lines.
4376     //     One line per match, capture group one is the desired data.
4377     //
4378     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4379     if (U_FAILURE(status)) {
4380         dataerrln("RegexPattern::compile() error");
4381         return;
4382     }
4383     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4384
4385     //
4386     //  Regex to split a test file line into fields.
4387     //    There are six fields, separated by tabs.
4388     //
4389     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4390
4391     //
4392     //  Regex to identify test patterns with flag settings, and to separate them.
4393     //    Test patterns with flags look like 'pattern'i
4394     //    Test patterns without flags are not quoted:   pattern
4395     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4396     //
4397     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4398     RegexMatcher* flagMat = flagPat->matcher(status);
4399
4400     //
4401     // The Perl tests reference several perl-isms, which are evaluated/substituted
4402     //   in the test data.  Not being perl, this must be done explicitly.  Here
4403     //   are string constants and REs for these constructs.
4404     //
4405     UnicodeString nulnulSrc("${nulnul}");
4406     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4407     nulnul = nulnul.unescape();
4408
4409     UnicodeString ffffSrc("${ffff}");
4410     UnicodeString ffff("\\uffff", -1, US_INV);
4411     ffff = ffff.unescape();
4412
4413     //  regexp for $-[0], $+[2], etc.
4414     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4415     RegexMatcher *groupsMat = groupsPat->matcher(status);
4416
4417     //  regexp for $0, $1, $2, etc.
4418     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4419     RegexMatcher *cgMat = cgPat->matcher(status);
4420
4421
4422     //
4423     // Main Loop for the Perl Tests, runs once per line from the
4424     //   test data file.
4425     //
4426     int32_t  lineNum = 0;
4427     int32_t  skippedUnimplementedCount = 0;
4428     while (lineMat->find()) {
4429         lineNum++;
4430
4431         //
4432         //  Get a line, break it into its fields, do the Perl
4433         //    variable substitutions.
4434         //
4435         UnicodeString line = lineMat->group(1, status);
4436         UnicodeString fields[7];
4437         fieldPat->split(line, fields, 7, status);
4438
4439         flagMat->reset(fields[0]);
4440         flagMat->matches(status);
4441         UnicodeString pattern  = flagMat->group(2, status);
4442         pattern.findAndReplace("${bang}", "!");
4443         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4444         pattern.findAndReplace(ffffSrc, ffff);
4445
4446         //
4447         //  Identify patterns that include match flag settings,
4448         //    split off the flags, remove the extra quotes.
4449         //
4450         UnicodeString flagStr = flagMat->group(3, status);
4451         if (U_FAILURE(status)) {
4452             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4453             return;
4454         }
4455         int32_t flags = 0;
4456         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4457         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4458         const UChar UChar_m = 0x6d;
4459         const UChar UChar_x = 0x78;
4460         const UChar UChar_y = 0x79;
4461         if (flagStr.indexOf(UChar_i) != -1) {
4462             flags |= UREGEX_CASE_INSENSITIVE;
4463         }
4464         if (flagStr.indexOf(UChar_m) != -1) {
4465             flags |= UREGEX_MULTILINE;
4466         }
4467         if (flagStr.indexOf(UChar_x) != -1) {
4468             flags |= UREGEX_COMMENTS;
4469         }
4470
4471         //
4472         // Put the pattern in a UTF-8 UText
4473         //
4474         status = U_ZERO_ERROR;
4475         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4476         if (status == U_BUFFER_OVERFLOW_ERROR) {
4477             status = U_ZERO_ERROR;
4478             delete[] patternChars;
4479             patternCapacity = patternLength + 1;
4480             patternChars = new char[patternCapacity];
4481             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4482         }
4483         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4484
4485         //
4486         // Compile the test pattern.
4487         //
4488         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4489         if (status == U_REGEX_UNIMPLEMENTED) {
4490             //
4491             // Test of a feature that is planned for ICU, but not yet implemented.
4492             //   skip the test.
4493             skippedUnimplementedCount++;
4494             delete testPat;
4495             status = U_ZERO_ERROR;
4496             continue;
4497         }
4498
4499         if (U_FAILURE(status)) {
4500             // Some tests are supposed to generate errors.
4501             //   Only report an error for tests that are supposed to succeed.
4502             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4503                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4504             {
4505                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4506             }
4507             status = U_ZERO_ERROR;
4508             delete testPat;
4509             continue;
4510         }
4511
4512         if (fields[2].indexOf(UChar_i) >= 0) {
4513             // ICU should skip this test.
4514             delete testPat;
4515             continue;
4516         }
4517
4518         if (fields[2].indexOf(UChar_c) >= 0) {
4519             // This pattern should have caused a compilation error, but didn't/
4520             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4521             delete testPat;
4522             continue;
4523         }
4524
4525
4526         //
4527         // replace the Perl variables that appear in some of the
4528         //   match data strings.
4529         //
4530         UnicodeString matchString = fields[1];
4531         matchString.findAndReplace(nulnulSrc, nulnul);
4532         matchString.findAndReplace(ffffSrc,   ffff);
4533
4534         // Replace any \n in the match string with an actual new-line char.
4535         //  Don't do full unescape, as this unescapes more than Perl does, which
4536         //  causes other spurious failures in the tests.
4537         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4538
4539         //
4540         // Put the input in a UTF-8 UText
4541         //
4542         status = U_ZERO_ERROR;
4543         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4544         if (status == U_BUFFER_OVERFLOW_ERROR) {
4545             status = U_ZERO_ERROR;
4546             delete[] inputChars;
4547             inputCapacity = inputLength + 1;
4548             inputChars = new char[inputCapacity];
4549             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4550         }
4551         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4552
4553         //
4554         // Run the test, check for expected match/don't match result.
4555         //
4556         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4557         UBool found = testMat->find();
4558         UBool expected = FALSE;
4559         if (fields[2].indexOf(UChar_y) >=0) {
4560             expected = TRUE;
4561         }
4562         if (expected != found) {
4563             errln("line %d: Expected %smatch, got %smatch",
4564                 lineNum, expected?"":"no ", found?"":"no " );
4565             continue;
4566         }
4567
4568         // Don't try to check expected results if there is no match.
4569         //   (Some have stuff in the expected fields)
4570         if (!found) {
4571             delete testMat;
4572             delete testPat;
4573             continue;
4574         }
4575
4576         //
4577         // Interpret the Perl expression from the fourth field of the data file,
4578         // building up an ICU string from the results of the ICU match.
4579         //   The Perl expression will contain references to the results of
4580         //     a regex match, including the matched string, capture group strings,
4581         //     group starting and ending indicies, etc.
4582         //
4583         UnicodeString resultString;
4584         UnicodeString perlExpr = fields[3];
4585
4586         while (perlExpr.length() > 0) {
4587             groupsMat->reset(perlExpr);
4588             cgMat->reset(perlExpr);
4589
4590             if (perlExpr.startsWith("$&")) {
4591                 resultString.append(testMat->group(status));
4592                 perlExpr.remove(0, 2);
4593             }
4594
4595             else if (groupsMat->lookingAt(status)) {
4596                 // $-[0]   $+[2]  etc.
4597                 UnicodeString digitString = groupsMat->group(2, status);
4598                 int32_t t = 0;
4599                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4600                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4601                 int32_t matchPosition;
4602                 if (plusOrMinus.compare("+") == 0) {
4603                     matchPosition = testMat->end(groupNum, status);
4604                 } else {
4605                     matchPosition = testMat->start(groupNum, status);
4606                 }
4607                 if (matchPosition != -1) {
4608                     ICU_Utility::appendNumber(resultString, matchPosition);
4609                 }
4610                 perlExpr.remove(0, groupsMat->end(status));
4611             }
4612
4613             else if (cgMat->lookingAt(status)) {
4614                 // $1, $2, $3, etc.
4615                 UnicodeString digitString = cgMat->group(1, status);
4616                 int32_t t = 0;
4617                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4618                 if (U_SUCCESS(status)) {
4619                     resultString.append(testMat->group(groupNum, status));
4620                     status = U_ZERO_ERROR;
4621                 }
4622                 perlExpr.remove(0, cgMat->end(status));
4623             }
4624
4625             else if (perlExpr.startsWith("@-")) {
4626                 int32_t i;
4627                 for (i=0; i<=testMat->groupCount(); i++) {
4628                     if (i>0) {
4629                         resultString.append(" ");
4630                     }
4631                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4632                 }
4633                 perlExpr.remove(0, 2);
4634             }
4635
4636             else if (perlExpr.startsWith("@+")) {
4637                 int32_t i;
4638                 for (i=0; i<=testMat->groupCount(); i++) {
4639                     if (i>0) {
4640                         resultString.append(" ");
4641                     }
4642                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4643                 }
4644                 perlExpr.remove(0, 2);
4645             }
4646
4647             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4648                                                      //           or as an escaped sequence (e.g. \n)
4649                 if (perlExpr.length() > 1) {
4650                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4651                 }
4652                 UChar c = perlExpr.charAt(0);
4653                 switch (c) {
4654                 case 'n':   c = '\n'; break;
4655                 // add any other escape sequences that show up in the test expected results.
4656                 }
4657                 resultString.append(c);
4658                 perlExpr.remove(0, 1);
4659             }
4660
4661             else  {
4662                 // Any characters from the perl expression that we don't explicitly
4663                 //  recognize before here are assumed to be literals and copied
4664                 //  as-is to the expected results.
4665                 resultString.append(perlExpr.charAt(0));
4666                 perlExpr.remove(0, 1);
4667             }
4668
4669             if (U_FAILURE(status)) {
4670                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4671                 break;
4672             }
4673         }
4674
4675         //
4676         // Expected Results Compare
4677         //
4678         UnicodeString expectedS(fields[4]);
4679         expectedS.findAndReplace(nulnulSrc, nulnul);
4680         expectedS.findAndReplace(ffffSrc,   ffff);
4681         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4682
4683
4684         if (expectedS.compare(resultString) != 0) {
4685             err("Line %d: Incorrect perl expression results.", lineNum);
4686             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4687         }
4688
4689         delete testMat;
4690         delete testPat;
4691     }
4692
4693     //
4694     // All done.  Clean up allocated stuff.
4695     //
4696     delete cgMat;
4697     delete cgPat;
4698
4699     delete groupsMat;
4700     delete groupsPat;
4701
4702     delete flagMat;
4703     delete flagPat;
4704
4705     delete lineMat;
4706     delete linePat;
4707
4708     delete fieldPat;
4709     delete [] testData;
4710
4711     utext_close(&patternText);
4712     utext_close(&inputText);
4713
4714     delete [] patternChars;
4715     delete [] inputChars;
4716
4717
4718     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4719
4720 }
4721
4722
4723 //--------------------------------------------------------------
4724 //
4725 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4726 //             Use this pattern,
4727 //                 "(a?){1,8000000}"
4728 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4729 //                   This test is likely to be fragile, as further optimizations stop
4730 //                   more cases of pointless looping in the match engine.
4731 //
4732 //---------------------------------------------------------------
4733 void RegexTest::Bug6149() {
4734     UnicodeString pattern("(a?){1,8000000}");
4735     UnicodeString s("xyz");
4736     uint32_t flags = 0;
4737     UErrorCode status = U_ZERO_ERROR;
4738
4739     RegexMatcher  matcher(pattern, s, flags, status);
4740     UBool result = false;
4741     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4742     REGEX_ASSERT(result == FALSE);
4743  }
4744
4745
4746 //
4747 //   Callbacks()    Test the callback function.
4748 //                  When set, callbacks occur periodically during matching operations,
4749 //                  giving the application code the ability to abort the operation
4750 //                  before it's normal completion.
4751 //
4752
4753 struct callBackContext {
4754     RegexTest        *test;
4755     int32_t          maxCalls;
4756     int32_t          numCalls;
4757     int32_t          lastSteps;
4758     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4759 };
4760
4761 U_CDECL_BEGIN
4762 static UBool U_CALLCONV
4763 testCallBackFn(const void *context, int32_t steps) {
4764     callBackContext  *info = (callBackContext *)context;
4765     if (info->lastSteps+1 != steps) {
4766         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4767     }
4768     info->lastSteps = steps;
4769     info->numCalls++;
4770     return (info->numCalls < info->maxCalls);
4771 }
4772 U_CDECL_END
4773
4774 void RegexTest::Callbacks() {
4775    {
4776         // Getter returns NULLs if no callback has been set
4777
4778         //   The variables that the getter will fill in.
4779         //   Init to non-null values so that the action of the getter can be seen.
4780         const void          *returnedContext = &returnedContext;
4781         URegexMatchCallback *returnedFn = &testCallBackFn;
4782
4783         UErrorCode status = U_ZERO_ERROR;
4784         RegexMatcher matcher("x", 0, status);
4785         REGEX_CHECK_STATUS;
4786         matcher.getMatchCallback(returnedFn, returnedContext, status);
4787         REGEX_CHECK_STATUS;
4788         REGEX_ASSERT(returnedFn == NULL);
4789         REGEX_ASSERT(returnedContext == NULL);
4790     }
4791
4792    {
4793         // Set and Get work
4794         callBackContext cbInfo = {this, 0, 0, 0};
4795         const void          *returnedContext;
4796         URegexMatchCallback *returnedFn;
4797         UErrorCode status = U_ZERO_ERROR;
4798         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4799         REGEX_CHECK_STATUS;
4800         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4801         REGEX_CHECK_STATUS;
4802         matcher.getMatchCallback(returnedFn, returnedContext, status);
4803         REGEX_CHECK_STATUS;
4804         REGEX_ASSERT(returnedFn == testCallBackFn);
4805         REGEX_ASSERT(returnedContext == &cbInfo);
4806
4807         // A short-running match shouldn't invoke the callback
4808         status = U_ZERO_ERROR;
4809         cbInfo.reset(1);
4810         UnicodeString s = "xxx";
4811         matcher.reset(s);
4812         REGEX_ASSERT(matcher.matches(status));
4813         REGEX_CHECK_STATUS;
4814         REGEX_ASSERT(cbInfo.numCalls == 0);
4815
4816         // A medium-length match that runs long enough to invoke the
4817         //   callback, but not so long that the callback aborts it.
4818         status = U_ZERO_ERROR;
4819         cbInfo.reset(4);
4820         s = "aaaaaaaaaaaaaaaaaaab";
4821         matcher.reset(s);
4822         REGEX_ASSERT(matcher.matches(status)==FALSE);
4823         REGEX_CHECK_STATUS;
4824         REGEX_ASSERT(cbInfo.numCalls > 0);
4825
4826         // A longer running match that the callback function will abort.
4827         status = U_ZERO_ERROR;
4828         cbInfo.reset(4);
4829         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4830         matcher.reset(s);
4831         REGEX_ASSERT(matcher.matches(status)==FALSE);
4832         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4833         REGEX_ASSERT(cbInfo.numCalls == 4);
4834
4835         // A longer running find that the callback function will abort.
4836         status = U_ZERO_ERROR;
4837         cbInfo.reset(4);
4838         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4839         matcher.reset(s);
4840         REGEX_ASSERT(matcher.find(status)==FALSE);
4841         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4842         REGEX_ASSERT(cbInfo.numCalls == 4);
4843     }
4844
4845
4846 }
4847
4848
4849 //
4850 //   FindProgressCallbacks()    Test the find "progress" callback function.
4851 //                  When set, the find progress callback will be invoked during a find operations
4852 //                  after each return from a match attempt, giving the application the opportunity
4853 //                  to terminate a long-running find operation before it's normal completion.
4854 //
4855
4856 struct progressCallBackContext {
4857     RegexTest        *test;
4858     int64_t          lastIndex;
4859     int32_t          maxCalls;
4860     int32_t          numCalls;
4861     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4862 };
4863
4864 // call-back function for find().
4865 // Return TRUE to continue the find().
4866 // Return FALSE to stop the find().
4867 U_CDECL_BEGIN
4868 static UBool U_CALLCONV
4869 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4870     progressCallBackContext  *info = (progressCallBackContext *)context;
4871     info->numCalls++;
4872     info->lastIndex = matchIndex;
4873 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4874     return (info->numCalls < info->maxCalls);
4875 }
4876 U_CDECL_END
4877
4878 void RegexTest::FindProgressCallbacks() {
4879    {
4880         // Getter returns NULLs if no callback has been set
4881
4882         //   The variables that the getter will fill in.
4883         //   Init to non-null values so that the action of the getter can be seen.
4884         const void                  *returnedContext = &returnedContext;
4885         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4886
4887         UErrorCode status = U_ZERO_ERROR;
4888         RegexMatcher matcher("x", 0, status);
4889         REGEX_CHECK_STATUS;
4890         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4891         REGEX_CHECK_STATUS;
4892         REGEX_ASSERT(returnedFn == NULL);
4893         REGEX_ASSERT(returnedContext == NULL);
4894     }
4895
4896    {
4897         // Set and Get work
4898         progressCallBackContext cbInfo = {this, 0, 0, 0};
4899         const void                  *returnedContext;
4900         URegexFindProgressCallback  *returnedFn;
4901         UErrorCode status = U_ZERO_ERROR;
4902         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4903         REGEX_CHECK_STATUS;
4904         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4905         REGEX_CHECK_STATUS;
4906         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4907         REGEX_CHECK_STATUS;
4908         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4909         REGEX_ASSERT(returnedContext == &cbInfo);
4910
4911         // A find that matches on the initial position does NOT invoke the callback.
4912         status = U_ZERO_ERROR;
4913         cbInfo.reset(100);
4914         UnicodeString s = "aaxxx";
4915         matcher.reset(s);
4916 #if 0
4917         matcher.setTrace(TRUE);
4918 #endif
4919         REGEX_ASSERT(matcher.find(0, status));
4920         REGEX_CHECK_STATUS;
4921         REGEX_ASSERT(cbInfo.numCalls == 0);
4922
4923         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4924         //   but not so many times that we interrupt the operation.
4925         status = U_ZERO_ERROR;
4926         s = "aaaaaaaaaaaaaaaaaaab";
4927         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4928         matcher.reset(s);
4929         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4930         REGEX_CHECK_STATUS;
4931         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4932
4933         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4934         status = U_ZERO_ERROR;
4935         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4936         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4937         matcher.reset(s1);
4938         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4939         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4940         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4941
4942         // Now a match that will succeed, but after an interruption
4943         status = U_ZERO_ERROR;
4944         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4945         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4946         matcher.reset(s2);
4947         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4948         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4949         // Now retry the match from where left off
4950         cbInfo.maxCalls = 100; //  No callback limit
4951         status = U_ZERO_ERROR;
4952         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4953         REGEX_CHECK_STATUS;
4954     }
4955
4956
4957 }
4958
4959
4960 //---------------------------------------------------------------------------
4961 //
4962 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4963 //                             UTexts. The pure-C implementation of UText
4964 //                             has no mutable backing stores, but we can
4965 //                             use UnicodeString here to test the functionality.
4966 //
4967 //---------------------------------------------------------------------------
4968 void RegexTest::PreAllocatedUTextCAPI () {
4969     UErrorCode           status = U_ZERO_ERROR;
4970     URegularExpression  *re;
4971     UText                patternText = UTEXT_INITIALIZER;
4972     UnicodeString        buffer;
4973     UText                bufferText = UTEXT_INITIALIZER;
4974
4975     utext_openUnicodeString(&bufferText, &buffer, &status);
4976
4977     /*
4978      *  getText() and getUText()
4979      */
4980     {
4981         UText  text1 = UTEXT_INITIALIZER;
4982         UText  text2 = UTEXT_INITIALIZER;
4983         UChar  text2Chars[20];
4984         UText  *resultText;
4985
4986         status = U_ZERO_ERROR;
4987         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4988         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4989         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4990         utext_openUChars(&text2, text2Chars, -1, &status);
4991
4992         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4993         re = uregex_openUText(&patternText, 0, NULL, &status);
4994
4995         /* First set a UText */
4996         uregex_setUText(re, &text1, &status);
4997         resultText = uregex_getUText(re, &bufferText, &status);
4998         REGEX_CHECK_STATUS;
4999         REGEX_ASSERT(resultText == &bufferText);
5000         utext_setNativeIndex(resultText, 0);
5001         utext_setNativeIndex(&text1, 0);
5002         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5003
5004         resultText = uregex_getUText(re, &bufferText, &status);
5005         REGEX_CHECK_STATUS;
5006         REGEX_ASSERT(resultText == &bufferText);
5007         utext_setNativeIndex(resultText, 0);
5008         utext_setNativeIndex(&text1, 0);
5009         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5010
5011         /* Then set a UChar * */
5012         uregex_setText(re, text2Chars, 7, &status);
5013         resultText = uregex_getUText(re, &bufferText, &status);
5014         REGEX_CHECK_STATUS;
5015         REGEX_ASSERT(resultText == &bufferText);
5016         utext_setNativeIndex(resultText, 0);
5017         utext_setNativeIndex(&text2, 0);
5018         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5019
5020         uregex_close(re);
5021         utext_close(&text1);
5022         utext_close(&text2);
5023     }
5024
5025     /*
5026      *  group()
5027      */
5028     {
5029         UChar    text1[80];
5030         UText   *actual;
5031         UBool    result;
5032         int64_t  length = 0;
5033
5034         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5035         //                  012345678901234567890123456789012345678901234567
5036         //                  0         1         2         3         4
5037
5038         status = U_ZERO_ERROR;
5039         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5040         REGEX_CHECK_STATUS;
5041
5042         uregex_setText(re, text1, -1, &status);
5043         result = uregex_find(re, 0, &status);
5044         REGEX_ASSERT(result==TRUE);
5045
5046         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5047         status = U_ZERO_ERROR;
5048         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5049         REGEX_CHECK_STATUS;
5050         REGEX_ASSERT(actual == &bufferText);
5051         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5052         REGEX_ASSERT(length == 16);
5053         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5054
5055         /*  Capture group #1.  Should succeed, matching " interior ". */
5056         status = U_ZERO_ERROR;
5057         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5058         REGEX_CHECK_STATUS;
5059         REGEX_ASSERT(actual == &bufferText);
5060         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5061         REGEX_ASSERT(length == 10);
5062         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5063
5064         /*  Capture group out of range.  Error. */
5065         status = U_ZERO_ERROR;
5066         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5067         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5068         REGEX_ASSERT(actual == &bufferText);
5069         uregex_close(re);
5070
5071     }
5072
5073     /*
5074      *  replaceFirst()
5075      */
5076     {
5077         UChar    text1[80];
5078         UChar    text2[80];
5079         UText    replText = UTEXT_INITIALIZER;
5080         UText   *result;
5081         status = U_ZERO_ERROR;
5082         utext_openUnicodeString(&bufferText, &buffer, &status);
5083
5084         status = U_ZERO_ERROR;
5085         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5086         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5087         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5088
5089         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5090         REGEX_CHECK_STATUS;
5091
5092         /*  Normal case, with match */
5093         uregex_setText(re, text1, -1, &status);
5094         REGEX_CHECK_STATUS;
5095         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5096         REGEX_CHECK_STATUS;
5097         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5098         REGEX_CHECK_STATUS;
5099         REGEX_ASSERT(result == &bufferText);
5100         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5101
5102         /* No match.  Text should copy to output with no changes.  */
5103         uregex_setText(re, text2, -1, &status);
5104         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5105         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5106         REGEX_CHECK_STATUS;
5107         REGEX_ASSERT(result == &bufferText);
5108         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5109
5110         /* Unicode escapes */
5111         uregex_setText(re, text1, -1, &status);
5112         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5113         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5114         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5115         REGEX_CHECK_STATUS;
5116         REGEX_ASSERT(result == &bufferText);
5117         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5118
5119         uregex_close(re);
5120         utext_close(&replText);
5121     }
5122
5123
5124     /*
5125      *  replaceAll()
5126      */
5127     {
5128         UChar    text1[80];
5129         UChar    text2[80];
5130         UText    replText = UTEXT_INITIALIZER;
5131         UText   *result;
5132
5133         status = U_ZERO_ERROR;
5134         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5135         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5136         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5137
5138         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5139         REGEX_CHECK_STATUS;
5140
5141         /*  Normal case, with match */
5142         uregex_setText(re, text1, -1, &status);
5143         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5144         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5145         REGEX_CHECK_STATUS;
5146         REGEX_ASSERT(result == &bufferText);
5147         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5148
5149         /* No match.  Text should copy to output with no changes.  */
5150         uregex_setText(re, text2, -1, &status);
5151         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5152         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5153         REGEX_CHECK_STATUS;
5154         REGEX_ASSERT(result == &bufferText);
5155         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5156
5157         uregex_close(re);
5158         utext_close(&replText);
5159     }
5160
5161
5162     /*
5163      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5164      *   so we don't need to test it here.
5165      */
5166
5167     utext_close(&bufferText);
5168     utext_close(&patternText);
5169 }
5170
5171
5172 //--------------------------------------------------------------
5173 //
5174 //  NamedCapture   Check basic named capture group functionality
5175 //
5176 //--------------------------------------------------------------
5177 void RegexTest::NamedCapture() {
5178     UErrorCode status = U_ZERO_ERROR;
5179     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5180             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5181     REGEX_CHECK_STATUS;
5182     int32_t group = pat->groupNumberFromName("five", -1, status);
5183     REGEX_CHECK_STATUS;
5184     REGEX_ASSERT(5 == group);
5185     group = pat->groupNumberFromName("three", -1, status);
5186     REGEX_CHECK_STATUS;
5187     REGEX_ASSERT(3 == group);
5188
5189     status = U_ZERO_ERROR;
5190     group = pat->groupNumberFromName(UnicodeString("six"), status);
5191     REGEX_CHECK_STATUS;
5192     REGEX_ASSERT(6 == group);
5193
5194     status = U_ZERO_ERROR;
5195     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5196     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5197
5198     status = U_ZERO_ERROR;
5199
5200     // After copying a pattern, named capture should still work in the copy.
5201     RegexPattern *copiedPat = new RegexPattern(*pat);
5202     REGEX_ASSERT(*copiedPat == *pat);
5203     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5204
5205     group = copiedPat->groupNumberFromName("five", -1, status);
5206     REGEX_CHECK_STATUS;
5207     REGEX_ASSERT(5 == group);
5208     group = copiedPat->groupNumberFromName("three", -1, status);
5209     REGEX_CHECK_STATUS;
5210     REGEX_ASSERT(3 == group);
5211     delete copiedPat;
5212
5213     // ReplaceAll with named capture group.
5214     status = U_ZERO_ERROR;
5215     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5216     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5217     REGEX_CHECK_STATUS;
5218     // m.pattern().dumpPattern();
5219     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5220     REGEX_CHECK_STATUS;
5221     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5222     delete m;
5223
5224     // ReplaceAll, allowed capture group numbers.
5225     text = UnicodeString("abcmxyz");
5226     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5227     REGEX_CHECK_STATUS;
5228
5229     status = U_ZERO_ERROR;
5230     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5231     REGEX_CHECK_STATUS;
5232     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5233
5234     status = U_ZERO_ERROR;
5235     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5236     REGEX_CHECK_STATUS;
5237     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5238
5239     status = U_ZERO_ERROR;
5240     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5241     REGEX_CHECK_STATUS;
5242     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5243
5244     status = U_ZERO_ERROR;
5245     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5246     REGEX_CHECK_STATUS;
5247     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5248
5249     status = U_ZERO_ERROR;
5250     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5251     REGEX_CHECK_STATUS;
5252     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5253
5254     status = U_ZERO_ERROR;
5255     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5256     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5257
5258     status = U_ZERO_ERROR;
5259     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5260     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5261     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5262
5263     status = U_ZERO_ERROR;
5264     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5265     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5266     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5267
5268     status = U_ZERO_ERROR;
5269     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5270     REGEX_CHECK_STATUS;
5271     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5272
5273     status = U_ZERO_ERROR;
5274     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5275     REGEX_CHECK_STATUS;
5276     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5277
5278     status = U_ZERO_ERROR;
5279     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5280     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5281
5282     status = U_ZERO_ERROR;
5283     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5284     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5285
5286     status = U_ZERO_ERROR;
5287     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5288     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5289
5290     status = U_ZERO_ERROR;
5291     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5292     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5293
5294     delete m;
5295
5296     // Repeat the above replaceAll() tests using the plain C API, which
5297     //  has a separate implementation internally.
5298     //  TODO: factor out the test data.
5299
5300     status = U_ZERO_ERROR;
5301     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5302     REGEX_CHECK_STATUS;
5303     text = UnicodeString("abcmxyz");
5304     uregex_setText(re, text.getBuffer(), text.length(), &status);
5305     REGEX_CHECK_STATUS;
5306
5307     UChar resultBuf[100];
5308     int32_t resultLength;
5309     UnicodeString repl;
5310
5311     status = U_ZERO_ERROR;
5312     repl = UnicodeString("<$0>");
5313     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5314     REGEX_CHECK_STATUS;
5315     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5316
5317     status = U_ZERO_ERROR;
5318     repl = UnicodeString("<$1>");
5319     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5320     REGEX_CHECK_STATUS;
5321     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5322
5323     status = U_ZERO_ERROR;
5324     repl = UnicodeString("<${one}>");
5325     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5326     REGEX_CHECK_STATUS;
5327     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5328
5329     status = U_ZERO_ERROR;
5330     repl = UnicodeString("<$2>");
5331     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5332     REGEX_CHECK_STATUS;
5333     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5334
5335     status = U_ZERO_ERROR;
5336     repl = UnicodeString("<$3>");
5337     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5338     REGEX_CHECK_STATUS;
5339     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5340
5341     status = U_ZERO_ERROR;
5342     repl = UnicodeString("<$4>");
5343     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5344     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5345
5346     status = U_ZERO_ERROR;
5347     repl = UnicodeString("<$04>");
5348     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5349     REGEX_CHECK_STATUS;
5350     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5351
5352     status = U_ZERO_ERROR;
5353     repl = UnicodeString("<$000016>");
5354     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5355     REGEX_CHECK_STATUS;
5356     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5357
5358     status = U_ZERO_ERROR;
5359     repl = UnicodeString("<$3$2$1${one}>");
5360     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5361     REGEX_CHECK_STATUS;
5362     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5363
5364     status = U_ZERO_ERROR;
5365     repl = UnicodeString("$3$2$1${one}");
5366     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5367     REGEX_CHECK_STATUS;
5368     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5369
5370     status = U_ZERO_ERROR;
5371     repl = UnicodeString("<${noSuchName}>");
5372     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5373     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5374
5375     status = U_ZERO_ERROR;
5376     repl = UnicodeString("<${invalid-name}>");
5377     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5378     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5379
5380     status = U_ZERO_ERROR;
5381     repl = UnicodeString("<${one");
5382     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5384
5385     status = U_ZERO_ERROR;
5386     repl = UnicodeString("$not a capture group");
5387     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5388     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5389
5390     uregex_close(re);
5391 }
5392
5393 //--------------------------------------------------------------
5394 //
5395 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5396 //                       The point is not so much what the exact limit is,
5397 //                       but that a largish number doesn't hit bad non-linear performance,
5398 //                       and that exceeding the limit fails cleanly.
5399 //
5400 //--------------------------------------------------------------
5401 void RegexTest::NamedCaptureLimits() {
5402     if (quick) {
5403         logln("Skipping test. Runs in exhuastive mode only.");
5404         return;
5405     }
5406     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5407     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5408     char nnbuf[100];
5409     UnicodeString pattern;
5410     int32_t nn;
5411
5412     for (nn=1; nn<goodLimit; nn++) {
5413         sprintf(nnbuf, "(?<nn%d>)", nn);
5414         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5415     }
5416     UErrorCode status = U_ZERO_ERROR;
5417     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5418     REGEX_CHECK_STATUS;
5419     for (nn=1; nn<goodLimit; nn++) {
5420         sprintf(nnbuf, "nn%d", nn);
5421         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5422         REGEX_ASSERT(nn == groupNum);
5423         if (nn != groupNum) {
5424             break;
5425         }
5426     }
5427     delete pat;
5428
5429     pattern.remove();
5430     for (nn=1; nn<failLimit; nn++) {
5431         sprintf(nnbuf, "(?<nn%d>)", nn);
5432         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5433     }
5434     status = U_ZERO_ERROR;
5435     pat = RegexPattern::compile(pattern, 0, status);
5436     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5437     delete pat;
5438 }
5439
5440
5441 //--------------------------------------------------------------
5442 //
5443 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5444 //
5445 //---------------------------------------------------------------
5446 void RegexTest::Bug7651() {
5447     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5448     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5449     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5450     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5451     UnicodeString s("#ff @abcd This is test");
5452     RegexPattern  *REPattern = NULL;
5453     RegexMatcher  *REMatcher = NULL;
5454     UErrorCode status = U_ZERO_ERROR;
5455     UParseError pe;
5456
5457     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5458     REGEX_CHECK_STATUS;
5459     REMatcher = REPattern->matcher(s, status);
5460     REGEX_CHECK_STATUS;
5461     REGEX_ASSERT(REMatcher->find());
5462     REGEX_ASSERT(REMatcher->start(status) == 0);
5463     delete REPattern;
5464     delete REMatcher;
5465     status = U_ZERO_ERROR;
5466
5467     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5468     REGEX_CHECK_STATUS;
5469     REMatcher = REPattern->matcher(s, status);
5470     REGEX_CHECK_STATUS;
5471     REGEX_ASSERT(REMatcher->find());
5472     REGEX_ASSERT(REMatcher->start(status) == 0);
5473     delete REPattern;
5474     delete REMatcher;
5475     status = U_ZERO_ERROR;
5476  }
5477
5478 void RegexTest::Bug7740() {
5479     UErrorCode status = U_ZERO_ERROR;
5480     UnicodeString pattern = "(a)";
5481     UnicodeString text = "abcdef";
5482     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5483     REGEX_CHECK_STATUS;
5484     REGEX_ASSERT(m->lookingAt(status));
5485     REGEX_CHECK_STATUS;
5486     status = U_ILLEGAL_ARGUMENT_ERROR;
5487     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5488     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5489     REGEX_ASSERT(s == "");
5490     delete m;
5491 }
5492
5493 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5494
5495 void RegexTest::Bug8479() {
5496     UErrorCode status = U_ZERO_ERROR;
5497
5498     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5499     REGEX_CHECK_STATUS;
5500     if (U_SUCCESS(status))
5501     {
5502         UnicodeString str;
5503         str.setToBogus();
5504         pMatcher->reset(str);
5505         status = U_ZERO_ERROR;
5506         pMatcher->matches(status);
5507         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5508         delete pMatcher;
5509     }
5510 }
5511
5512
5513 // Bug 7029
5514 void RegexTest::Bug7029() {
5515     UErrorCode status = U_ZERO_ERROR;
5516
5517     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5518     UnicodeString text = "abc.def";
5519     UnicodeString splits[10];
5520     REGEX_CHECK_STATUS;
5521     int32_t numFields = pMatcher->split(text, splits, 10, status);
5522     REGEX_CHECK_STATUS;
5523     REGEX_ASSERT(numFields == 8);
5524     delete pMatcher;
5525 }
5526
5527 // Bug 9283
5528 //   This test is checking for the existance of any supplemental characters that case-fold
5529 //   to a bmp character.
5530 //
5531 //   At the time of this writing there are none. If any should appear in a subsequent release
5532 //   of Unicode, the code in regular expressions compilation that determines the longest
5533 //   posssible match for a literal string  will need to be enhanced.
5534 //
5535 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5536 //   for details on what to do in case of a failure of this test.
5537 //
5538 void RegexTest::Bug9283() {
5539 #if !UCONFIG_NO_NORMALIZATION
5540     UErrorCode status = U_ZERO_ERROR;
5541     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5542     REGEX_CHECK_STATUS;
5543     int32_t index;
5544     UChar32 c;
5545     for (index=0; ; index++) {
5546         c = supplementalsWithCaseFolding.charAt(index);
5547         if (c == -1) {
5548             break;
5549         }
5550         UnicodeString cf = UnicodeString(c).foldCase();
5551         REGEX_ASSERT(cf.length() >= 2);
5552     }
5553 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5554 }
5555
5556
5557 void RegexTest::CheckInvBufSize() {
5558   if(inv_next>=INV_BUFSIZ) {
5559     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5560           __FILE__, INV_BUFSIZ, inv_next);
5561   } else {
5562     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5563   }
5564 }
5565
5566
5567 void RegexTest::Bug10459() {
5568     UErrorCode status = U_ZERO_ERROR;
5569     UnicodeString patternString("(txt)");
5570     UnicodeString txtString("txt");
5571
5572     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5573     REGEX_CHECK_STATUS;
5574     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5575     REGEX_CHECK_STATUS;
5576
5577     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5578     REGEX_CHECK_STATUS;
5579
5580     uregex_setUText(icu_re, utext_txt, &status);
5581     REGEX_CHECK_STATUS;
5582
5583     // The bug was that calling uregex_group() before doing a matching operation
5584     //   was causing a segfault. Only for Regular Expressions created from UText.
5585     //   It should set an U_REGEX_INVALID_STATE.
5586
5587     UChar buf[100];
5588     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5589     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5590     REGEX_ASSERT(len == 0);
5591
5592     uregex_close(icu_re);
5593     utext_close(utext_pat);
5594     utext_close(utext_txt);
5595 }
5596
5597 void RegexTest::TestCaseInsensitiveStarters() {
5598     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5599     //  become stale because of new Unicode characters.
5600     // If it is stale, rerun the generation tool
5601     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5602     // and replace the embedded data in i18n/regexcmp.cpp
5603
5604     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5605         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5606             continue;
5607         }
5608         UnicodeSet s(cp, cp);
5609         s.closeOver(USET_CASE_INSENSITIVE);
5610         UnicodeSetIterator setIter(s);
5611         while (setIter.next()) {
5612             if (!setIter.isString()) {
5613                 continue;
5614             }
5615             const UnicodeString &str = setIter.getString();
5616             UChar32 firstChar = str.char32At(0);
5617             UnicodeSet starters;
5618             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5619             if (!starters.contains(cp)) {
5620                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5621                 return;
5622             }
5623         }
5624     }
5625 }
5626
5627
5628 void RegexTest::TestBug11049() {
5629     // Original bug report: pattern with match start consisting of one of several individual characters,
5630     //  and the text being matched ending with a supplementary character. find() would read past the
5631     //  end of the input text when searching for potential match starting points.
5632
5633     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5634     // detect the bad read.
5635
5636     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5637     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5638
5639     // Test again with a pattern starting with a single character,
5640     // which takes a different code path than starting with an OR expression,
5641     // but with similar logic.
5642     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5643     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5644 }
5645
5646 // Run a single test case from TestBug11049(). Internal function.
5647 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5648     UErrorCode status = U_ZERO_ERROR;
5649     UnicodeString patternString = UnicodeString(pattern).unescape();
5650     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5651
5652     UnicodeString dataString = UnicodeString(data).unescape();
5653     UChar *exactBuffer = new UChar[dataString.length()];
5654     dataString.extract(exactBuffer, dataString.length(), status);
5655     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5656
5657     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5658     REGEX_CHECK_STATUS;
5659     matcher->reset(ut);
5660     UBool result = matcher->find();
5661     if (result != expectMatch) {
5662         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5663               __FILE__, lineNumber, expectMatch, result, pattern, data);
5664     }
5665
5666     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5667     //   off-by-one on find() with match at the last code point.
5668     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5669     //   because string.unescape() will only shrink it.
5670     char * utf8Buffer = new char[uprv_strlen(data)+1];
5671     u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5672     REGEX_CHECK_STATUS;
5673     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5674     REGEX_CHECK_STATUS;
5675     matcher->reset(ut);
5676     result = matcher->find();
5677     if (result != expectMatch) {
5678         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5679               __FILE__, lineNumber, expectMatch, result, pattern, data);
5680     }
5681     delete [] utf8Buffer;
5682
5683     utext_close(ut);
5684     delete [] exactBuffer;
5685 }
5686
5687
5688 void RegexTest::TestBug11371() {
5689     if (quick) {
5690         logln("Skipping test. Runs in exhuastive mode only.");
5691         return;
5692     }
5693     UErrorCode status = U_ZERO_ERROR;
5694     UnicodeString patternString;
5695
5696     for (int i=0; i<8000000; i++) {
5697         patternString.append(UnicodeString("()"));
5698     }
5699     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5700     if (status != U_REGEX_PATTERN_TOO_BIG) {
5701         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5702               __FILE__, __LINE__, u_errorName(status));
5703     }
5704
5705     status = U_ZERO_ERROR;
5706     patternString = "(";
5707     for (int i=0; i<20000000; i++) {
5708         patternString.append(UnicodeString("A++"));
5709     }
5710     patternString.append(UnicodeString("){0}B++"));
5711     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5712     if (status != U_REGEX_PATTERN_TOO_BIG) {
5713         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5714               __FILE__, __LINE__, u_errorName(status));
5715     }
5716
5717     // Pattern with too much string data, such that string indexes overflow operand data field size
5718     // in compiled instruction.
5719     status = U_ZERO_ERROR;
5720     patternString = "";
5721     while (patternString.length() < 0x00ffffff) {
5722         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5723     }
5724     patternString.append(UnicodeString("X? trailing string"));
5725     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5726     if (status != U_REGEX_PATTERN_TOO_BIG) {
5727         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5728               __FILE__, __LINE__, u_errorName(status));
5729     }
5730 }
5731
5732 void RegexTest::TestBug11480() {
5733     // C API, get capture group of a group that does not participate in the match.
5734     //        (Returns a zero length string, with nul termination,
5735     //         indistinguishable from a group with a zero length match.)
5736
5737     UErrorCode status = U_ZERO_ERROR;
5738     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5739     REGEX_CHECK_STATUS;
5740     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5741     uregex_setText(re, text.getBuffer(), text.length(), &status);
5742     REGEX_CHECK_STATUS;
5743     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5744     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5745     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5746     REGEX_ASSERT(length == 0);
5747     REGEX_ASSERT(buf[0] == 13);
5748     REGEX_ASSERT(buf[1] == 0);
5749     REGEX_ASSERT(buf[2] == 13);
5750     uregex_close(re);
5751
5752     // UText C++ API, length of match is 0 for non-participating matches.
5753     UText ut = UTEXT_INITIALIZER;
5754     utext_openUnicodeString(&ut, &text, &status);
5755     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5756     REGEX_CHECK_STATUS;
5757     matcher.reset(&ut);
5758     REGEX_ASSERT(matcher.lookingAt(0, status));
5759
5760     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5761     int64_t groupLen = -666;
5762     UText group = UTEXT_INITIALIZER;
5763     matcher.group(1, &group, groupLen, status);
5764     REGEX_CHECK_STATUS;
5765     REGEX_ASSERT(groupLen == 1);
5766     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5767
5768     // Capture group 2, the (B), does not participate in the match.
5769     matcher.group(2, &group, groupLen, status);
5770     REGEX_CHECK_STATUS;
5771     REGEX_ASSERT(groupLen == 0);
5772     REGEX_ASSERT(matcher.start(2, status) == -1);
5773     REGEX_CHECK_STATUS;
5774 }
5775
5776 void RegexTest::TestBug12884() {
5777     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5778     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5779     UnicodeString text(u"hello");
5780     UErrorCode status = U_ZERO_ERROR;
5781     RegexMatcher m(pattern, text, 0, status);
5782     REGEX_CHECK_STATUS;
5783     m.setTimeLimit(5, status);
5784     m.find(status);
5785     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5786
5787     // Non-greedy loops. They take a different code path during matching.
5788     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5789     status = U_ZERO_ERROR;
5790     RegexMatcher ngM(ngPattern, text, 0, status);
5791     REGEX_CHECK_STATUS;
5792     ngM.setTimeLimit(5, status);
5793     ngM.find(status);
5794     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5795
5796     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5797     const char *text8 = u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5798                           "carácter, sin importar la plataforma, sin importar el programa,"
5799                           "sin importar el idioma.";
5800     status = U_ZERO_ERROR;
5801     LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5802     REGEX_CHECK_STATUS;
5803     m.reset(ut.getAlias());
5804     m.find(status);
5805     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5806
5807     status = U_ZERO_ERROR;
5808     ngM.reset(ut.getAlias());
5809     ngM.find(status);
5810     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5811 }
5812
5813 // Bug 13631. A find() of a pattern with a zero length look-behind assertions
5814 //            can cause a read past the end of the input text.
5815 //            The failure is seen when running this test with Clang's Addresss Sanitizer.
5816
5817 void RegexTest::TestBug13631() {
5818     const UChar *pats[] = { u"(?<!^)",
5819                             u"(?<=^)",
5820                             nullptr
5821                           };
5822     for (const UChar **pat=pats; *pat; ++pat) {
5823         UErrorCode status = U_ZERO_ERROR;
5824         UnicodeString upat(*pat);
5825         RegexMatcher matcher(upat, 0, status);
5826         const UChar s =u'a';
5827         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5828         REGEX_CHECK_STATUS;
5829         matcher.reset(ut);
5830         while (matcher.find()) {
5831         }
5832         utext_close(ut);
5833     }
5834 }
5835
5836 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5837 //           where a following group specification would be expected.
5838 //           Failure shows when running the test under Clang's Address Sanitizer.
5839
5840 void RegexTest::TestBug13632() {
5841     UErrorCode status = U_ZERO_ERROR;
5842     URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5843     const char16_t *sourceString = u"Hello, world.";
5844     uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5845
5846     const int32_t destCap = 20;
5847     char16_t dest[destCap] = {};
5848     const char16_t replacement[] = {u'x', u'$'};    // Not nul terminated string.
5849     uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5850
5851     assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5852     uregex_close(re);
5853 }
5854
5855 void RegexTest::TestBug20359() {
5856     // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5857     // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5858     // Changed implementation to loop instead of recursing.
5859
5860     UnicodeString pattern;
5861     for (int i=0; i<50000; ++i) {
5862         pattern += u"\\Q\\E";
5863     }
5864     pattern += u"x";
5865
5866     UErrorCode status = U_ZERO_ERROR;
5867     LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5868                                        0, nullptr, &status));
5869     assertSuccess(WHERE, status);
5870
5871     // We have passed the point where the bug crashed. The following is a small sanity
5872     // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5873
5874     uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5875     assertSuccess(WHERE, status);
5876     assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5877     assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5878     assertSuccess(WHERE, status);
5879 }
5880
5881 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */