icuSources/test/intltest/regextst.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 2002-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8
   9 //
  10 //   regextst.cpp
  11 //
  12 //      ICU Regular Expressions test, part of intltest.
  13 //
  14
  15 /*
  16      NOTE!!
  17
  18      PLEASE be careful about ASCII assumptions in this test.
  19      This test is one of the worst repeat offenders.
  20      If you have questions, contact someone on the ICU PMC
  21      who has access to an EBCDIC system.
  22
  23  */
  24
  25 #include "intltest.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31
  32 #include "unicode/localpointer.h"
  33 #include "unicode/regex.h"
  34 #include "unicode/uchar.h"
  35 #include "unicode/ucnv.h"
  36 #include "unicode/uniset.h"
  37 #include "unicode/uregex.h"
  38 #include "unicode/usetiter.h"
  39 #include "unicode/ustring.h"
  40 #include "unicode/utext.h"
  41
  42 #include "regextst.h"
  43 #include "regexcmp.h"
  44 #include "uvector.h"
  45 #include "util.h"
  46 #include "cmemory.h"
  47 #include "cstring.h"
  48 #include "uinvchar.h"
  49
  50 #define SUPPORT_MUTATING_INPUT_STRING   0
  51
  52 //---------------------------------------------------------------------------
  53 //
  54 //  Test class boilerplate
  55 //
  56 //---------------------------------------------------------------------------
  57 RegexTest::RegexTest()
  58 {
  59 }
  60
  61
  62 RegexTest::~RegexTest()
  63 {
  64 }
  65
  66
  67
  68 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  69 {
  70     if (exec) logln("TestSuite RegexTest: ");
  71     TESTCASE_AUTO_BEGIN;
  72     TESTCASE_AUTO(Basic);
  73     TESTCASE_AUTO(API_Match);
  74     TESTCASE_AUTO(API_Replace);
  75     TESTCASE_AUTO(API_Pattern);
  76 #if !UCONFIG_NO_FILE_IO
  77     TESTCASE_AUTO(Extended);
  78 #endif
  79     TESTCASE_AUTO(Errors);
  80     TESTCASE_AUTO(PerlTests);
  81     TESTCASE_AUTO(Callbacks);
  82     TESTCASE_AUTO(FindProgressCallbacks);
  83     TESTCASE_AUTO(Bug6149);
  84     TESTCASE_AUTO(UTextBasic);
  85     TESTCASE_AUTO(API_Match_UTF8);
  86     TESTCASE_AUTO(API_Replace_UTF8);
  87     TESTCASE_AUTO(API_Pattern_UTF8);
  88     TESTCASE_AUTO(PerlTestsUTF8);
  89     TESTCASE_AUTO(PreAllocatedUTextCAPI);
  90     TESTCASE_AUTO(Bug7651);
  91     TESTCASE_AUTO(Bug7740);
  92     TESTCASE_AUTO(Bug8479);
  93     TESTCASE_AUTO(Bug7029);
  94     TESTCASE_AUTO(CheckInvBufSize);
  95     TESTCASE_AUTO(Bug9283);
  96     TESTCASE_AUTO(Bug10459);
  97     TESTCASE_AUTO(TestCaseInsensitiveStarters);
  98     TESTCASE_AUTO(TestBug11049);
  99     TESTCASE_AUTO(TestBug11371);
 100     TESTCASE_AUTO(TestBug11480);
 101     TESTCASE_AUTO(NamedCapture);
 102     TESTCASE_AUTO(NamedCaptureLimits);
 103     TESTCASE_AUTO(TestBug12884);
 104     TESTCASE_AUTO_END;
 105 }
 106
 107
 108 /**
 109  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 110  * into ASCII.
 111  * @see utext_openUTF8
 112  */
 113 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 114
 115 //---------------------------------------------------------------------------
 116 //
 117 //   Error Checking / Reporting macros used in all of the tests.
 118 //
 119 //---------------------------------------------------------------------------
 120
 121 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 122   int64_t oldIndex = utext_getNativeIndex(text);
 123   utext_setNativeIndex(text, 0);
 124   char *bufPtr = buf;
 125   UChar32 c = utext_next32From(text, 0);
 126   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 127     if (0x000020<=c && c<0x00007e) {
 128       *bufPtr = c;
 129     } else {
 130 #if 0
 131       sprintf(bufPtr,"U+%04X", c);
 132       bufPtr+= strlen(bufPtr)-1;
 133 #else
 134       *bufPtr = '%';
 135 #endif
 136     }
 137     bufPtr++;
 138     c = UTEXT_NEXT32(text);
 139   }
 140   *bufPtr = 0;
 141 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 142   char *ebuf = (char*)malloc(bufLen);
 143   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 144   uprv_strncpy(buf, ebuf, bufLen);
 145   free((void*)ebuf);
 146 #endif
 147   utext_setNativeIndex(text, oldIndex);
 148 }
 149
 150
 151 static char ASSERT_BUF[1024];
 152
 153 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 154   if(message.length()==0) {
 155     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 156   } else {
 157     UnicodeString buf;
 158     IntlTest::prettify(message,buf);
 159     if(buf.length()==0) {
 160       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 161     } else {
 162       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 163       if(ASSERT_BUF[0]==0) {
 164         ASSERT_BUF[0]=0;
 165         for(int32_t i=0;i<buf.length();i++) {
 166           UChar ch = buf[i];
 167           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 168         }
 169       }
 170     }
 171   }
 172   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 173   return ASSERT_BUF;
 174 }
 175
 176 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
 177
 178 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
 179                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
 180
 181 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
 182
 183 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
 184 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
 185     __LINE__, u_errorName(errcode), u_errorName(status));};}
 186
 187 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
 188     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
 189
 190 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
 191     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
 192
 193 // expected: const char * , restricted to invariant characters.
 194 // actual: const UnicodeString &
 195 #define REGEX_ASSERT_UNISTR(expected, actual) { \
 196     if (UnicodeString(expected, -1, US_INV) != (actual)) { \
 197         errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
 198                 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
 199
 200
 201 static UBool testUTextEqual(UText *uta, UText *utb) {
 202     UChar32 ca = 0;
 203     UChar32 cb = 0;
 204     utext_setNativeIndex(uta, 0);
 205     utext_setNativeIndex(utb, 0);
 206     do {
 207         ca = utext_next32(uta);
 208         cb = utext_next32(utb);
 209         if (ca != cb) {
 210             break;
 211         }
 212     } while (ca != U_SENTINEL);
 213     return ca == cb;
 214 }
 215
 216
 217 /**
 218  * @param expected expected text in UTF-8 (not platform) codepage
 219  */
 220 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 221     UErrorCode status = U_ZERO_ERROR;
 222     UText expectedText = UTEXT_INITIALIZER;
 223     utext_openUTF8(&expectedText, expected, -1, &status);
 224     if(U_FAILURE(status)) {
 225       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 226       return;
 227     }
 228     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 229       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 230       return;
 231     }
 232     utext_setNativeIndex(actual, 0);
 233     if (!testUTextEqual(&expectedText, actual)) {
 234         char buf[201 /*21*/];
 235         char expectedBuf[201];
 236         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 237         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 238         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 239     }
 240     utext_close(&expectedText);
 241 }
 242 /**
 243  * @param expected invariant (platform local text) input
 244  */
 245
 246 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 247     UErrorCode status = U_ZERO_ERROR;
 248     UText expectedText = UTEXT_INITIALIZER;
 249     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 250     if(U_FAILURE(status)) {
 251       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 252       return;
 253     }
 254     utext_setNativeIndex(actual, 0);
 255     if (!testUTextEqual(&expectedText, actual)) {
 256         char buf[201 /*21*/];
 257         char expectedBuf[201];
 258         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 259         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 260         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 261     }
 262     utext_close(&expectedText);
 263 }
 264
 265 /**
 266  * Assumes utf-8 input
 267  */
 268 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
 269 /**
 270  * Assumes Invariant input
 271  */
 272 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 273
 274 /**
 275  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 276  * passed into utext_openUTF8. An error will be given if
 277  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 278  */
 279
 280 #define INV_BUFSIZ 2048 /* increase this if too small */
 281
 282 static int64_t inv_next=0;
 283
 284 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
 285 static char inv_buf[INV_BUFSIZ];
 286 #endif
 287
 288 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 289   if(length==-1) length=strlen(inv);
 290 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 291   inv_next+=length;
 292   return utext_openUTF8(ut, inv, length, status);
 293 #else
 294   if(inv_next+length+1>INV_BUFSIZ) {
 295     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 296             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 297     *status = U_MEMORY_ALLOCATION_ERROR;
 298     return NULL;
 299   }
 300
 301   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 302   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 303   inv_next+=length;
 304
 305 #if 0
 306   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 307 #endif
 308
 309   return utext_openUTF8(ut, (const char*)buf, length, status);
 310 #endif
 311 }
 312
 313
 314 //---------------------------------------------------------------------------
 315 //
 316 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 317 //                       for the LookingAt() and  Match() functions.
 318 //
 319 //       usage:
 320 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 321 //
 322 //          The expected results are UBool - TRUE or FALSE.
 323 //          The input text is unescaped.  The pattern is not.
 324 //
 325 //
 326 //---------------------------------------------------------------------------
 327
 328 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 329
 330 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 331     const UnicodeString pattern(pat, -1, US_INV);
 332     const UnicodeString inputText(text, -1, US_INV);
 333     UErrorCode          status  = U_ZERO_ERROR;
 334     UParseError         pe;
 335     RegexPattern        *REPattern = NULL;
 336     RegexMatcher        *REMatcher = NULL;
 337     UBool               retVal     = TRUE;
 338
 339     UnicodeString patString(pat, -1, US_INV);
 340     REPattern = RegexPattern::compile(patString, 0, pe, status);
 341     if (U_FAILURE(status)) {
 342         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 343             line, u_errorName(status));
 344         return FALSE;
 345     }
 346     if (line==376) { REPattern->dumpPattern();}
 347
 348     UnicodeString inputString(inputText);
 349     UnicodeString unEscapedInput = inputString.unescape();
 350     REMatcher = REPattern->matcher(unEscapedInput, status);
 351     if (U_FAILURE(status)) {
 352         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 353             line, u_errorName(status));
 354         return FALSE;
 355     }
 356
 357     UBool actualmatch;
 358     actualmatch = REMatcher->lookingAt(status);
 359     if (U_FAILURE(status)) {
 360         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 361             line, u_errorName(status));
 362         retVal =  FALSE;
 363     }
 364     if (actualmatch != looking) {
 365         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 366         retVal = FALSE;
 367     }
 368
 369     status = U_ZERO_ERROR;
 370     actualmatch = REMatcher->matches(status);
 371     if (U_FAILURE(status)) {
 372         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 373             line, u_errorName(status));
 374         retVal = FALSE;
 375     }
 376     if (actualmatch != match) {
 377         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 378         retVal = FALSE;
 379     }
 380
 381     if (retVal == FALSE) {
 382         REPattern->dumpPattern();
 383     }
 384
 385     delete REPattern;
 386     delete REMatcher;
 387     return retVal;
 388 }
 389
 390
 391 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 392     UText               pattern    = UTEXT_INITIALIZER;
 393     int32_t             inputUTF8Length;
 394     char                *textChars = NULL;
 395     UText               inputText  = UTEXT_INITIALIZER;
 396     UErrorCode          status     = U_ZERO_ERROR;
 397     UParseError         pe;
 398     RegexPattern        *REPattern = NULL;
 399     RegexMatcher        *REMatcher = NULL;
 400     UBool               retVal     = TRUE;
 401
 402     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 403     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 404     if (U_FAILURE(status)) {
 405         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 406             line, u_errorName(status));
 407         return FALSE;
 408     }
 409
 410     UnicodeString inputString(text, -1, US_INV);
 411     UnicodeString unEscapedInput = inputString.unescape();
 412     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 413     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 414
 415     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 416     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 417         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 418         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 419         return TRUE; // not a failure of the Regex engine
 420     }
 421     status = U_ZERO_ERROR; // buffer overflow
 422     textChars = new char[inputUTF8Length+1];
 423     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 424     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 425
 426     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 427     if (U_FAILURE(status)) {
 428         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 429             line, u_errorName(status));
 430         return FALSE;
 431     }
 432
 433     UBool actualmatch;
 434     actualmatch = REMatcher->lookingAt(status);
 435     if (U_FAILURE(status)) {
 436         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 437             line, u_errorName(status));
 438         retVal =  FALSE;
 439     }
 440     if (actualmatch != looking) {
 441         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 442         retVal = FALSE;
 443     }
 444
 445     status = U_ZERO_ERROR;
 446     actualmatch = REMatcher->matches(status);
 447     if (U_FAILURE(status)) {
 448         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 449             line, u_errorName(status));
 450         retVal = FALSE;
 451     }
 452     if (actualmatch != match) {
 453         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 454         retVal = FALSE;
 455     }
 456
 457     if (retVal == FALSE) {
 458         REPattern->dumpPattern();
 459     }
 460
 461     delete REPattern;
 462     delete REMatcher;
 463     utext_close(&inputText);
 464     utext_close(&pattern);
 465     delete[] textChars;
 466     return retVal;
 467 }
 468
 469
 470
 471 //---------------------------------------------------------------------------
 472 //
 473 //    REGEX_ERR       Macro + invocation function to simplify writing tests
 474 //                       regex tests for incorrect patterns
 475 //
 476 //       usage:
 477 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 478 //
 479 //---------------------------------------------------------------------------
 480 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 481
 482 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 483                           UErrorCode expectedStatus, int32_t line) {
 484     UnicodeString       pattern(pat);
 485
 486     UErrorCode          status         = U_ZERO_ERROR;
 487     UParseError         pe;
 488     RegexPattern        *callerPattern = NULL;
 489
 490     //
 491     //  Compile the caller's pattern
 492     //
 493     UnicodeString patString(pat);
 494     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 495     if (status != expectedStatus) {
 496         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 497     } else {
 498         if (status != U_ZERO_ERROR) {
 499             if (pe.line != errLine || pe.offset != errCol) {
 500                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 501                     line, errLine, errCol, pe.line, pe.offset);
 502             }
 503         }
 504     }
 505
 506     delete callerPattern;
 507
 508     //
 509     //  Compile again, using a UTF-8-based UText
 510     //
 511     UText patternText = UTEXT_INITIALIZER;
 512     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 513     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 514     if (status != expectedStatus) {
 515         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 516     } else {
 517         if (status != U_ZERO_ERROR) {
 518             if (pe.line != errLine || pe.offset != errCol) {
 519                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 520                     line, errLine, errCol, pe.line, pe.offset);
 521             }
 522         }
 523     }
 524
 525     delete callerPattern;
 526     utext_close(&patternText);
 527 }
 528
 529
 530
 531 //---------------------------------------------------------------------------
 532 //
 533 //      Basic      Check for basic functionality of regex pattern matching.
 534 //                 Avoid the use of REGEX_FIND test macro, which has
 535 //                 substantial dependencies on basic Regex functionality.
 536 //
 537 //---------------------------------------------------------------------------
 538 void RegexTest::Basic() {
 539
 540
 541 //
 542 // Debug - slide failing test cases early
 543 //
 544 #if 0
 545     {
 546         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
 547         UParseError pe;
 548         UErrorCode  status = U_ZERO_ERROR;
 549         RegexPattern *pattern;
 550         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
 551         pattern->dumpPattern();
 552         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
 553         UBool result = m->find();
 554         printf("result = %d\n", result);
 555         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
 556         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
 557     }
 558     exit(1);
 559 #endif
 560
 561
 562     //
 563     // Pattern with parentheses
 564     //
 565     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
 566     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
 567     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
 568
 569     //
 570     // Patterns with *
 571     //
 572     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
 573     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
 574     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
 575     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
 576     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
 577
 578     REGEX_TESTLM("a*", "",  TRUE, TRUE);
 579     REGEX_TESTLM("a*", "b", TRUE, FALSE);
 580
 581
 582     //
 583     //  Patterns with "."
 584     //
 585     REGEX_TESTLM(".", "abc", TRUE, FALSE);
 586     REGEX_TESTLM("...", "abc", TRUE, TRUE);
 587     REGEX_TESTLM("....", "abc", FALSE, FALSE);
 588     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
 589     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
 590     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
 591     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
 592     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
 593
 594     //
 595     //  Patterns with * applied to chars at end of literal string
 596     //
 597     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
 598     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
 599
 600     //
 601     //  Supplemental chars match as single chars, not a pair of surrogates.
 602     //
 603     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
 604     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
 605     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
 606
 607
 608     //
 609     //  UnicodeSets in the pattern
 610     //
 611     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
 612     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
 613     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
 614     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 615     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 616     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
 617
 618     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
 619     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
 620     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
 621     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
 622     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
 623
 624     //
 625     //   OR operator in patterns
 626     //
 627     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
 628     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
 629     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
 630     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
 631
 632     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
 633     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
 634     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
 635     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
 636     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
 637     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
 638
 639     //
 640     //  +
 641     //
 642     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
 643     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
 644     REGEX_TESTLM("b+", "", FALSE, FALSE);
 645     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
 646     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
 647     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
 648
 649     //
 650     //   ?
 651     //
 652     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
 653     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
 654     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
 655     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
 656     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
 657     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
 658     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
 659     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
 660     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
 661
 662     //
 663     //  Escape sequences that become single literal chars, handled internally
 664     //   by ICU's Unescape.
 665     //
 666
 667     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
 668     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
 669     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
 670     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
 671     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
 672     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
 673     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
 674     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
 675     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
 676     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
 677
 678     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
 679     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
 680
 681     // Escape of special chars in patterns
 682     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
 683 }
 684
 685
 686 //---------------------------------------------------------------------------
 687 //
 688 //    UTextBasic   Check for quirks that are specific to the UText
 689 //                 implementation.
 690 //
 691 //---------------------------------------------------------------------------
 692 void RegexTest::UTextBasic() {
 693     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 694     UErrorCode status = U_ZERO_ERROR;
 695     UText pattern = UTEXT_INITIALIZER;
 696     utext_openUTF8(&pattern, str_abc, -1, &status);
 697     RegexMatcher matcher(&pattern, 0, status);
 698     REGEX_CHECK_STATUS;
 699
 700     UText input = UTEXT_INITIALIZER;
 701     utext_openUTF8(&input, str_abc, -1, &status);
 702     REGEX_CHECK_STATUS;
 703     matcher.reset(&input);
 704     REGEX_CHECK_STATUS;
 705     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 706
 707     matcher.reset(matcher.inputText());
 708     REGEX_CHECK_STATUS;
 709     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 710
 711     utext_close(&pattern);
 712     utext_close(&input);
 713 }
 714
 715
 716 //---------------------------------------------------------------------------
 717 //
 718 //      API_Match   Test that the API for class RegexMatcher
 719 //                  is present and nominally working, but excluding functions
 720 //                  implementing replace operations.
 721 //
 722 //---------------------------------------------------------------------------
 723 void RegexTest::API_Match() {
 724     UParseError         pe;
 725     UErrorCode          status=U_ZERO_ERROR;
 726     int32_t             flags = 0;
 727
 728     //
 729     // Debug - slide failing test cases early
 730     //
 731 #if 0
 732     {
 733     }
 734     return;
 735 #endif
 736
 737     //
 738     // Simple pattern compilation
 739     //
 740     {
 741         UnicodeString       re("abc");
 742         RegexPattern        *pat2;
 743         pat2 = RegexPattern::compile(re, flags, pe, status);
 744         REGEX_CHECK_STATUS;
 745
 746         UnicodeString inStr1 = "abcdef this is a test";
 747         UnicodeString instr2 = "not abc";
 748         UnicodeString empty  = "";
 749
 750
 751         //
 752         // Matcher creation and reset.
 753         //
 754         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 755         REGEX_CHECK_STATUS;
 756         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 757         REGEX_ASSERT(m1->input() == inStr1);
 758         m1->reset(instr2);
 759         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 760         REGEX_ASSERT(m1->input() == instr2);
 761         m1->reset(inStr1);
 762         REGEX_ASSERT(m1->input() == inStr1);
 763         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 764         m1->reset(empty);
 765         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 766         REGEX_ASSERT(m1->input() == empty);
 767         REGEX_ASSERT(&m1->pattern() == pat2);
 768
 769         //
 770         //  reset(pos, status)
 771         //
 772         m1->reset(inStr1);
 773         m1->reset(4, status);
 774         REGEX_CHECK_STATUS;
 775         REGEX_ASSERT(m1->input() == inStr1);
 776         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 777
 778         m1->reset(-1, status);
 779         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 780         status = U_ZERO_ERROR;
 781
 782         m1->reset(0, status);
 783         REGEX_CHECK_STATUS;
 784         status = U_ZERO_ERROR;
 785
 786         int32_t len = m1->input().length();
 787         m1->reset(len-1, status);
 788         REGEX_CHECK_STATUS;
 789         status = U_ZERO_ERROR;
 790
 791         m1->reset(len, status);
 792         REGEX_CHECK_STATUS;
 793         status = U_ZERO_ERROR;
 794
 795         m1->reset(len+1, status);
 796         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 797         status = U_ZERO_ERROR;
 798
 799         //
 800         // match(pos, status)
 801         //
 802         m1->reset(instr2);
 803         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 804         m1->reset();
 805         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 806         m1->reset();
 807         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 808         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 809         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 810         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 811
 812         // Match() at end of string should fail, but should not
 813         //  be an error.
 814         status = U_ZERO_ERROR;
 815         len = m1->input().length();
 816         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 817         REGEX_CHECK_STATUS;
 818
 819         // Match beyond end of string should fail with an error.
 820         status = U_ZERO_ERROR;
 821         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 822         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 823
 824         // Successful match at end of string.
 825         {
 826             status = U_ZERO_ERROR;
 827             RegexMatcher m("A?", 0, status);  // will match zero length string.
 828             REGEX_CHECK_STATUS;
 829             m.reset(inStr1);
 830             len = inStr1.length();
 831             REGEX_ASSERT(m.matches(len, status) == TRUE);
 832             REGEX_CHECK_STATUS;
 833             m.reset(empty);
 834             REGEX_ASSERT(m.matches(0, status) == TRUE);
 835             REGEX_CHECK_STATUS;
 836         }
 837
 838
 839         //
 840         // lookingAt(pos, status)
 841         //
 842         status = U_ZERO_ERROR;
 843         m1->reset(instr2);  // "not abc"
 844         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 845         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 846         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 847         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 848         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 849         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 850         status = U_ZERO_ERROR;
 851         len = m1->input().length();
 852         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 853         REGEX_CHECK_STATUS;
 854         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 855         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 856
 857         delete m1;
 858         delete pat2;
 859     }
 860
 861
 862     //
 863     // Capture Group.
 864     //     RegexMatcher::start();
 865     //     RegexMatcher::end();
 866     //     RegexMatcher::groupCount();
 867     //
 868     {
 869         int32_t             flags=0;
 870         UParseError         pe;
 871         UErrorCode          status=U_ZERO_ERROR;
 872
 873         UnicodeString       re("01(23(45)67)(.*)");
 874         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 875         REGEX_CHECK_STATUS;
 876         UnicodeString data = "0123456789";
 877
 878         RegexMatcher *matcher = pat->matcher(data, status);
 879         REGEX_CHECK_STATUS;
 880         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 881         static const int32_t matchStarts[] = {0,  2, 4, 8};
 882         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 883         int32_t i;
 884         for (i=0; i<4; i++) {
 885             int32_t actualStart = matcher->start(i, status);
 886             REGEX_CHECK_STATUS;
 887             if (actualStart != matchStarts[i]) {
 888                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 889                     __LINE__, i, matchStarts[i], actualStart);
 890             }
 891             int32_t actualEnd = matcher->end(i, status);
 892             REGEX_CHECK_STATUS;
 893             if (actualEnd != matchEnds[i]) {
 894                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 895                     __LINE__, i, matchEnds[i], actualEnd);
 896             }
 897         }
 898
 899         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 900         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 901
 902         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 903         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 904         matcher->reset();
 905         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 906
 907         matcher->lookingAt(status);
 908         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 909         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 910         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 911         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 912         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 913         REGEX_CHECK_STATUS;
 914         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 915         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 916         matcher->reset();
 917         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 918
 919         delete matcher;
 920         delete pat;
 921
 922     }
 923
 924     //
 925     //  find
 926     //
 927     {
 928         int32_t             flags=0;
 929         UParseError         pe;
 930         UErrorCode          status=U_ZERO_ERROR;
 931
 932         UnicodeString       re("abc");
 933         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 934         REGEX_CHECK_STATUS;
 935         UnicodeString data = ".abc..abc...abc..";
 936         //                    012345678901234567
 937
 938         RegexMatcher *matcher = pat->matcher(data, status);
 939         REGEX_CHECK_STATUS;
 940         REGEX_ASSERT(matcher->find());
 941         REGEX_ASSERT(matcher->start(status) == 1);
 942         REGEX_ASSERT(matcher->find());
 943         REGEX_ASSERT(matcher->start(status) == 6);
 944         REGEX_ASSERT(matcher->find());
 945         REGEX_ASSERT(matcher->start(status) == 12);
 946         REGEX_ASSERT(matcher->find() == FALSE);
 947         REGEX_ASSERT(matcher->find() == FALSE);
 948
 949         matcher->reset();
 950         REGEX_ASSERT(matcher->find());
 951         REGEX_ASSERT(matcher->start(status) == 1);
 952
 953         REGEX_ASSERT(matcher->find(0, status));
 954         REGEX_ASSERT(matcher->start(status) == 1);
 955         REGEX_ASSERT(matcher->find(1, status));
 956         REGEX_ASSERT(matcher->start(status) == 1);
 957         REGEX_ASSERT(matcher->find(2, status));
 958         REGEX_ASSERT(matcher->start(status) == 6);
 959         REGEX_ASSERT(matcher->find(12, status));
 960         REGEX_ASSERT(matcher->start(status) == 12);
 961         REGEX_ASSERT(matcher->find(13, status) == FALSE);
 962         REGEX_ASSERT(matcher->find(16, status) == FALSE);
 963         REGEX_ASSERT(matcher->find(17, status) == FALSE);
 964         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
 965
 966         status = U_ZERO_ERROR;
 967         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 968         status = U_ZERO_ERROR;
 969         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
 970
 971         REGEX_ASSERT(matcher->groupCount() == 0);
 972
 973         delete matcher;
 974         delete pat;
 975     }
 976
 977
 978     //
 979     //  find, with \G in pattern (true if at the end of a previous match).
 980     //
 981     {
 982         int32_t             flags=0;
 983         UParseError         pe;
 984         UErrorCode          status=U_ZERO_ERROR;
 985
 986         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
 987         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 988         REGEX_CHECK_STATUS;
 989         UnicodeString data = ".abcabc.abc..";
 990         //                    012345678901234567
 991
 992         RegexMatcher *matcher = pat->matcher(data, status);
 993         REGEX_CHECK_STATUS;
 994         REGEX_ASSERT(matcher->find());
 995         REGEX_ASSERT(matcher->start(status) == 0);
 996         REGEX_ASSERT(matcher->start(1, status) == -1);
 997         REGEX_ASSERT(matcher->start(2, status) == 1);
 998
 999         REGEX_ASSERT(matcher->find());
1000         REGEX_ASSERT(matcher->start(status) == 4);
1001         REGEX_ASSERT(matcher->start(1, status) == 4);
1002         REGEX_ASSERT(matcher->start(2, status) == -1);
1003         REGEX_CHECK_STATUS;
1004
1005         delete matcher;
1006         delete pat;
1007     }
1008
1009     //
1010     //   find with zero length matches, match position should bump ahead
1011     //     to prevent loops.
1012     //
1013     {
1014         int32_t                 i;
1015         UErrorCode          status=U_ZERO_ERROR;
1016         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1017                                                       //   using an always-true look-ahead.
1018         REGEX_CHECK_STATUS;
1019         UnicodeString s("    ");
1020         m.reset(s);
1021         for (i=0; ; i++) {
1022             if (m.find() == FALSE) {
1023                 break;
1024             }
1025             REGEX_ASSERT(m.start(status) == i);
1026             REGEX_ASSERT(m.end(status) == i);
1027         }
1028         REGEX_ASSERT(i==5);
1029
1030         // Check that the bump goes over surrogate pairs OK
1031         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1032         s = s.unescape();
1033         m.reset(s);
1034         for (i=0; ; i+=2) {
1035             if (m.find() == FALSE) {
1036                 break;
1037             }
1038             REGEX_ASSERT(m.start(status) == i);
1039             REGEX_ASSERT(m.end(status) == i);
1040         }
1041         REGEX_ASSERT(i==10);
1042     }
1043     {
1044         // find() loop breaking test.
1045         //        with pattern of /.?/, should see a series of one char matches, then a single
1046         //        match of zero length at the end of the input string.
1047         int32_t                 i;
1048         UErrorCode          status=U_ZERO_ERROR;
1049         RegexMatcher        m(".?", 0, status);
1050         REGEX_CHECK_STATUS;
1051         UnicodeString s("    ");
1052         m.reset(s);
1053         for (i=0; ; i++) {
1054             if (m.find() == FALSE) {
1055                 break;
1056             }
1057             REGEX_ASSERT(m.start(status) == i);
1058             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1059         }
1060         REGEX_ASSERT(i==5);
1061     }
1062
1063
1064     //
1065     // Matchers with no input string behave as if they had an empty input string.
1066     //
1067
1068     {
1069         UErrorCode status = U_ZERO_ERROR;
1070         RegexMatcher  m(".?", 0, status);
1071         REGEX_CHECK_STATUS;
1072         REGEX_ASSERT(m.find());
1073         REGEX_ASSERT(m.start(status) == 0);
1074         REGEX_ASSERT(m.input() == "");
1075     }
1076     {
1077         UErrorCode status = U_ZERO_ERROR;
1078         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1079         RegexMatcher  *m = p->matcher(status);
1080         REGEX_CHECK_STATUS;
1081
1082         REGEX_ASSERT(m->find() == FALSE);
1083         REGEX_ASSERT(m->input() == "");
1084         delete m;
1085         delete p;
1086     }
1087
1088     //
1089     // Regions
1090     //
1091     {
1092         UErrorCode status = U_ZERO_ERROR;
1093         UnicodeString testString("This is test data");
1094         RegexMatcher m(".*", testString,  0, status);
1095         REGEX_CHECK_STATUS;
1096         REGEX_ASSERT(m.regionStart() == 0);
1097         REGEX_ASSERT(m.regionEnd() == testString.length());
1098         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1099         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1100
1101         m.region(2,4, status);
1102         REGEX_CHECK_STATUS;
1103         REGEX_ASSERT(m.matches(status));
1104         REGEX_ASSERT(m.start(status)==2);
1105         REGEX_ASSERT(m.end(status)==4);
1106         REGEX_CHECK_STATUS;
1107
1108         m.reset();
1109         REGEX_ASSERT(m.regionStart() == 0);
1110         REGEX_ASSERT(m.regionEnd() == testString.length());
1111
1112         UnicodeString shorterString("short");
1113         m.reset(shorterString);
1114         REGEX_ASSERT(m.regionStart() == 0);
1115         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1116
1117         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1118         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1119         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1120         REGEX_ASSERT(&m == &m.reset());
1121         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1122
1123         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1124         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1125         REGEX_ASSERT(&m == &m.reset());
1126         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1127
1128         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1129         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1130         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1131         REGEX_ASSERT(&m == &m.reset());
1132         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1133
1134         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1135         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1136         REGEX_ASSERT(&m == &m.reset());
1137         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1138
1139     }
1140
1141     //
1142     // hitEnd() and requireEnd()
1143     //
1144     {
1145         UErrorCode status = U_ZERO_ERROR;
1146         UnicodeString testString("aabb");
1147         RegexMatcher m1(".*", testString,  0, status);
1148         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1149         REGEX_ASSERT(m1.hitEnd() == TRUE);
1150         REGEX_ASSERT(m1.requireEnd() == FALSE);
1151         REGEX_CHECK_STATUS;
1152
1153         status = U_ZERO_ERROR;
1154         RegexMatcher m2("a*", testString, 0, status);
1155         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1156         REGEX_ASSERT(m2.hitEnd() == FALSE);
1157         REGEX_ASSERT(m2.requireEnd() == FALSE);
1158         REGEX_CHECK_STATUS;
1159
1160         status = U_ZERO_ERROR;
1161         RegexMatcher m3(".*$", testString, 0, status);
1162         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1163         REGEX_ASSERT(m3.hitEnd() == TRUE);
1164         REGEX_ASSERT(m3.requireEnd() == TRUE);
1165         REGEX_CHECK_STATUS;
1166     }
1167
1168
1169     //
1170     // Compilation error on reset with UChar *
1171     //   These were a hazard that people were stumbling over with runtime errors.
1172     //   Changed them to compiler errors by adding private methods that more closely
1173     //   matched the incorrect use of the functions.
1174     //
1175 #if 0
1176     {
1177         UErrorCode status = U_ZERO_ERROR;
1178         UChar ucharString[20];
1179         RegexMatcher m(".", 0, status);
1180         m.reset(ucharString);  // should not compile.
1181
1182         RegexPattern *p = RegexPattern::compile(".", 0, status);
1183         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1184
1185         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1186     }
1187 #endif
1188
1189     //
1190     //  Time Outs.
1191     //       Note:  These tests will need to be changed when the regexp engine is
1192     //              able to detect and cut short the exponential time behavior on
1193     //              this type of match.
1194     //
1195     {
1196         UErrorCode status = U_ZERO_ERROR;
1197         //    Enough 'a's in the string to cause the match to time out.
1198         //       (Each additional 'a' doubles the time)
1199         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1200         RegexMatcher matcher("(a+)+b", testString, 0, status);
1201         REGEX_CHECK_STATUS;
1202         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1203         matcher.setTimeLimit(100, status);
1204         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1205         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1206         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1207     }
1208     {
1209         UErrorCode status = U_ZERO_ERROR;
1210         //   Few enough 'a's to slip in under the time limit.
1211         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1212         RegexMatcher matcher("(a+)+b", testString, 0, status);
1213         REGEX_CHECK_STATUS;
1214         matcher.setTimeLimit(100, status);
1215         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1216         REGEX_CHECK_STATUS;
1217     }
1218
1219     //
1220     //  Stack Limits
1221     //
1222     {
1223         UErrorCode status = U_ZERO_ERROR;
1224         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1225
1226         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1227         //   of the '+', and makes the stack frames larger.
1228         RegexMatcher matcher("(A)+A$", testString, 0, status);
1229
1230         // With the default stack, this match should fail to run
1231         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1232         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1233
1234         // With unlimited stack, it should run
1235         status = U_ZERO_ERROR;
1236         matcher.setStackLimit(0, status);
1237         REGEX_CHECK_STATUS;
1238         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1239         REGEX_CHECK_STATUS;
1240         REGEX_ASSERT(matcher.getStackLimit() == 0);
1241
1242         // With a limited stack, the match should fail
1243         status = U_ZERO_ERROR;
1244         matcher.setStackLimit(10000, status);
1245         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1246         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1247         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1248     }
1249
1250         // A pattern that doesn't save state should work with
1251         //   a minimal sized stack
1252     {
1253         UErrorCode status = U_ZERO_ERROR;
1254         UnicodeString testString = "abc";
1255         RegexMatcher matcher("abc", testString, 0, status);
1256         REGEX_CHECK_STATUS;
1257         matcher.setStackLimit(30, status);
1258         REGEX_CHECK_STATUS;
1259         REGEX_ASSERT(matcher.matches(status) == TRUE);
1260         REGEX_CHECK_STATUS;
1261         REGEX_ASSERT(matcher.getStackLimit() == 30);
1262
1263         // Negative stack sizes should fail
1264         status = U_ZERO_ERROR;
1265         matcher.setStackLimit(1000, status);
1266         REGEX_CHECK_STATUS;
1267         matcher.setStackLimit(-1, status);
1268         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1269         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1270     }
1271
1272
1273 }
1274
1275
1276
1277
1278
1279
1280 //---------------------------------------------------------------------------
1281 //
1282 //      API_Replace        API test for class RegexMatcher, testing the
1283 //                         Replace family of functions.
1284 //
1285 //---------------------------------------------------------------------------
1286 void RegexTest::API_Replace() {
1287     //
1288     //  Replace
1289     //
1290     int32_t             flags=0;
1291     UParseError         pe;
1292     UErrorCode          status=U_ZERO_ERROR;
1293
1294     UnicodeString       re("abc");
1295     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1296     REGEX_CHECK_STATUS;
1297     UnicodeString data = ".abc..abc...abc..";
1298     //                    012345678901234567
1299     RegexMatcher *matcher = pat->matcher(data, status);
1300
1301     //
1302     //  Plain vanilla matches.
1303     //
1304     UnicodeString  dest;
1305     dest = matcher->replaceFirst("yz", status);
1306     REGEX_CHECK_STATUS;
1307     REGEX_ASSERT(dest == ".yz..abc...abc..");
1308
1309     dest = matcher->replaceAll("yz", status);
1310     REGEX_CHECK_STATUS;
1311     REGEX_ASSERT(dest == ".yz..yz...yz..");
1312
1313     //
1314     //  Plain vanilla non-matches.
1315     //
1316     UnicodeString d2 = ".abx..abx...abx..";
1317     matcher->reset(d2);
1318     dest = matcher->replaceFirst("yz", status);
1319     REGEX_CHECK_STATUS;
1320     REGEX_ASSERT(dest == ".abx..abx...abx..");
1321
1322     dest = matcher->replaceAll("yz", status);
1323     REGEX_CHECK_STATUS;
1324     REGEX_ASSERT(dest == ".abx..abx...abx..");
1325
1326     //
1327     // Empty source string
1328     //
1329     UnicodeString d3 = "";
1330     matcher->reset(d3);
1331     dest = matcher->replaceFirst("yz", status);
1332     REGEX_CHECK_STATUS;
1333     REGEX_ASSERT(dest == "");
1334
1335     dest = matcher->replaceAll("yz", status);
1336     REGEX_CHECK_STATUS;
1337     REGEX_ASSERT(dest == "");
1338
1339     //
1340     // Empty substitution string
1341     //
1342     matcher->reset(data);              // ".abc..abc...abc.."
1343     dest = matcher->replaceFirst("", status);
1344     REGEX_CHECK_STATUS;
1345     REGEX_ASSERT(dest == "...abc...abc..");
1346
1347     dest = matcher->replaceAll("", status);
1348     REGEX_CHECK_STATUS;
1349     REGEX_ASSERT(dest == "........");
1350
1351     //
1352     // match whole string
1353     //
1354     UnicodeString d4 = "abc";
1355     matcher->reset(d4);
1356     dest = matcher->replaceFirst("xyz", status);
1357     REGEX_CHECK_STATUS;
1358     REGEX_ASSERT(dest == "xyz");
1359
1360     dest = matcher->replaceAll("xyz", status);
1361     REGEX_CHECK_STATUS;
1362     REGEX_ASSERT(dest == "xyz");
1363
1364     //
1365     // Capture Group, simple case
1366     //
1367     UnicodeString       re2("a(..)");
1368     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1369     REGEX_CHECK_STATUS;
1370     UnicodeString d5 = "abcdefg";
1371     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1372     REGEX_CHECK_STATUS;
1373     dest = matcher2->replaceFirst("$1$1", status);
1374     REGEX_CHECK_STATUS;
1375     REGEX_ASSERT(dest == "bcbcdefg");
1376
1377     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1378     REGEX_CHECK_STATUS;
1379     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1380
1381     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1382     REGEX_ASSERT(U_FAILURE(status));
1383     status = U_ZERO_ERROR;
1384
1385     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1386     replacement = replacement.unescape();
1387     dest = matcher2->replaceFirst(replacement, status);
1388     REGEX_CHECK_STATUS;
1389     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1390
1391     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1392
1393
1394     //
1395     // Replacement String with \u hex escapes
1396     //
1397     {
1398         UnicodeString  src = "abc 1 abc 2 abc 3";
1399         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1400         matcher->reset(src);
1401         UnicodeString  result = matcher->replaceAll(substitute, status);
1402         REGEX_CHECK_STATUS;
1403         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1404     }
1405     {
1406         UnicodeString  src = "abc !";
1407         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1408         matcher->reset(src);
1409         UnicodeString  result = matcher->replaceAll(substitute, status);
1410         REGEX_CHECK_STATUS;
1411         UnicodeString expected = UnicodeString("--");
1412         expected.append((UChar32)0x10000);
1413         expected.append("-- !");
1414         REGEX_ASSERT(result == expected);
1415     }
1416     // TODO:  need more through testing of capture substitutions.
1417
1418     // Bug 4057
1419     //
1420     {
1421         status = U_ZERO_ERROR;
1422         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1423         RegexMatcher m("ss(.*?)ee", 0, status);
1424         REGEX_CHECK_STATUS;
1425         UnicodeString result;
1426
1427         // Multiple finds do NOT bump up the previous appendReplacement postion.
1428         m.reset(s);
1429         m.find();
1430         m.find();
1431         m.appendReplacement(result, "ooh", status);
1432         REGEX_CHECK_STATUS;
1433         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1434
1435         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1436         status = U_ZERO_ERROR;
1437         result.truncate(0);
1438         m.reset(10, status);
1439         m.find();
1440         m.find();
1441         m.appendReplacement(result, "ooh", status);
1442         REGEX_CHECK_STATUS;
1443         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1444
1445         // find() at interior of string, appendReplacemnt still starts at beginning.
1446         status = U_ZERO_ERROR;
1447         result.truncate(0);
1448         m.reset();
1449         m.find(10, status);
1450         m.find();
1451         m.appendReplacement(result, "ooh", status);
1452         REGEX_CHECK_STATUS;
1453         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1454
1455         m.appendTail(result);
1456         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1457
1458     }
1459
1460     delete matcher2;
1461     delete pat2;
1462     delete matcher;
1463     delete pat;
1464 }
1465
1466
1467 //---------------------------------------------------------------------------
1468 //
1469 //      API_Pattern       Test that the API for class RegexPattern is
1470 //                        present and nominally working.
1471 //
1472 //---------------------------------------------------------------------------
1473 void RegexTest::API_Pattern() {
1474     RegexPattern        pata;    // Test default constructor to not crash.
1475     RegexPattern        patb;
1476
1477     REGEX_ASSERT(pata == patb);
1478     REGEX_ASSERT(pata == pata);
1479
1480     UnicodeString re1("abc[a-l][m-z]");
1481     UnicodeString re2("def");
1482     UErrorCode    status = U_ZERO_ERROR;
1483     UParseError   pe;
1484
1485     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1486     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1487     REGEX_CHECK_STATUS;
1488     REGEX_ASSERT(*pat1 == *pat1);
1489     REGEX_ASSERT(*pat1 != pata);
1490
1491     // Assign
1492     patb = *pat1;
1493     REGEX_ASSERT(patb == *pat1);
1494
1495     // Copy Construct
1496     RegexPattern patc(*pat1);
1497     REGEX_ASSERT(patc == *pat1);
1498     REGEX_ASSERT(patb == patc);
1499     REGEX_ASSERT(pat1 != pat2);
1500     patb = *pat2;
1501     REGEX_ASSERT(patb != patc);
1502     REGEX_ASSERT(patb == *pat2);
1503
1504     // Compile with no flags.
1505     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1506     REGEX_ASSERT(*pat1a == *pat1);
1507
1508     REGEX_ASSERT(pat1a->flags() == 0);
1509
1510     // Compile with different flags should be not equal
1511     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1512     REGEX_CHECK_STATUS;
1513
1514     REGEX_ASSERT(*pat1b != *pat1a);
1515     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1516     REGEX_ASSERT(pat1a->flags() == 0);
1517     delete pat1b;
1518
1519     // clone
1520     RegexPattern *pat1c = pat1->clone();
1521     REGEX_ASSERT(*pat1c == *pat1);
1522     REGEX_ASSERT(*pat1c != *pat2);
1523
1524     delete pat1c;
1525     delete pat1a;
1526     delete pat1;
1527     delete pat2;
1528
1529
1530     //
1531     //   Verify that a matcher created from a cloned pattern works.
1532     //     (Jitterbug 3423)
1533     //
1534     {
1535         UErrorCode     status     = U_ZERO_ERROR;
1536         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1537         RegexPattern  *pClone     = pSource->clone();
1538         delete         pSource;
1539         RegexMatcher  *mFromClone = pClone->matcher(status);
1540         REGEX_CHECK_STATUS;
1541         UnicodeString s = "Hello World";
1542         mFromClone->reset(s);
1543         REGEX_ASSERT(mFromClone->find() == TRUE);
1544         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1545         REGEX_ASSERT(mFromClone->find() == TRUE);
1546         REGEX_ASSERT(mFromClone->group(status) == "World");
1547         REGEX_ASSERT(mFromClone->find() == FALSE);
1548         delete mFromClone;
1549         delete pClone;
1550     }
1551
1552     //
1553     //   matches convenience API
1554     //
1555     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1556     REGEX_CHECK_STATUS;
1557     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1558     REGEX_CHECK_STATUS;
1559     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1560     REGEX_CHECK_STATUS;
1561     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1562     REGEX_CHECK_STATUS;
1563     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1564     REGEX_CHECK_STATUS;
1565     status = U_INDEX_OUTOFBOUNDS_ERROR;
1566     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1567     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1568
1569
1570     //
1571     // Split()
1572     //
1573     status = U_ZERO_ERROR;
1574     pat1 = RegexPattern::compile(" +",  pe, status);
1575     REGEX_CHECK_STATUS;
1576     UnicodeString  fields[10];
1577
1578     int32_t n;
1579     n = pat1->split("Now is the time", fields, 10, status);
1580     REGEX_CHECK_STATUS;
1581     REGEX_ASSERT(n==4);
1582     REGEX_ASSERT(fields[0]=="Now");
1583     REGEX_ASSERT(fields[1]=="is");
1584     REGEX_ASSERT(fields[2]=="the");
1585     REGEX_ASSERT(fields[3]=="time");
1586     REGEX_ASSERT(fields[4]=="");
1587
1588     n = pat1->split("Now is the time", fields, 2, status);
1589     REGEX_CHECK_STATUS;
1590     REGEX_ASSERT(n==2);
1591     REGEX_ASSERT(fields[0]=="Now");
1592     REGEX_ASSERT(fields[1]=="is the time");
1593     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1594
1595     fields[1] = "*";
1596     status = U_ZERO_ERROR;
1597     n = pat1->split("Now is the time", fields, 1, status);
1598     REGEX_CHECK_STATUS;
1599     REGEX_ASSERT(n==1);
1600     REGEX_ASSERT(fields[0]=="Now is the time");
1601     REGEX_ASSERT(fields[1]=="*");
1602     status = U_ZERO_ERROR;
1603
1604     n = pat1->split("    Now       is the time   ", fields, 10, status);
1605     REGEX_CHECK_STATUS;
1606     REGEX_ASSERT(n==6);
1607     REGEX_ASSERT(fields[0]=="");
1608     REGEX_ASSERT(fields[1]=="Now");
1609     REGEX_ASSERT(fields[2]=="is");
1610     REGEX_ASSERT(fields[3]=="the");
1611     REGEX_ASSERT(fields[4]=="time");
1612     REGEX_ASSERT(fields[5]=="");
1613
1614     n = pat1->split("     ", fields, 10, status);
1615     REGEX_CHECK_STATUS;
1616     REGEX_ASSERT(n==2);
1617     REGEX_ASSERT(fields[0]=="");
1618     REGEX_ASSERT(fields[1]=="");
1619
1620     fields[0] = "foo";
1621     n = pat1->split("", fields, 10, status);
1622     REGEX_CHECK_STATUS;
1623     REGEX_ASSERT(n==0);
1624     REGEX_ASSERT(fields[0]=="foo");
1625
1626     delete pat1;
1627
1628     //  split, with a pattern with (capture)
1629     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1630     REGEX_CHECK_STATUS;
1631
1632     status = U_ZERO_ERROR;
1633     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1634     REGEX_CHECK_STATUS;
1635     REGEX_ASSERT(n==7);
1636     REGEX_ASSERT(fields[0]=="");
1637     REGEX_ASSERT(fields[1]=="a");
1638     REGEX_ASSERT(fields[2]=="Now is ");
1639     REGEX_ASSERT(fields[3]=="b");
1640     REGEX_ASSERT(fields[4]=="the time");
1641     REGEX_ASSERT(fields[5]=="c");
1642     REGEX_ASSERT(fields[6]=="");
1643     REGEX_ASSERT(status==U_ZERO_ERROR);
1644
1645     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1646     REGEX_CHECK_STATUS;
1647     REGEX_ASSERT(n==7);
1648     REGEX_ASSERT(fields[0]=="  ");
1649     REGEX_ASSERT(fields[1]=="a");
1650     REGEX_ASSERT(fields[2]=="Now is ");
1651     REGEX_ASSERT(fields[3]=="b");
1652     REGEX_ASSERT(fields[4]=="the time");
1653     REGEX_ASSERT(fields[5]=="c");
1654     REGEX_ASSERT(fields[6]=="");
1655
1656     status = U_ZERO_ERROR;
1657     fields[6] = "foo";
1658     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1659     REGEX_CHECK_STATUS;
1660     REGEX_ASSERT(n==6);
1661     REGEX_ASSERT(fields[0]=="  ");
1662     REGEX_ASSERT(fields[1]=="a");
1663     REGEX_ASSERT(fields[2]=="Now is ");
1664     REGEX_ASSERT(fields[3]=="b");
1665     REGEX_ASSERT(fields[4]=="the time");
1666     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1667     REGEX_ASSERT(fields[6]=="foo");
1668
1669     status = U_ZERO_ERROR;
1670     fields[5] = "foo";
1671     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1672     REGEX_CHECK_STATUS;
1673     REGEX_ASSERT(n==5);
1674     REGEX_ASSERT(fields[0]=="  ");
1675     REGEX_ASSERT(fields[1]=="a");
1676     REGEX_ASSERT(fields[2]=="Now is ");
1677     REGEX_ASSERT(fields[3]=="b");
1678     REGEX_ASSERT(fields[4]=="the time<c>");
1679     REGEX_ASSERT(fields[5]=="foo");
1680
1681     status = U_ZERO_ERROR;
1682     fields[5] = "foo";
1683     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1684     REGEX_CHECK_STATUS;
1685     REGEX_ASSERT(n==5);
1686     REGEX_ASSERT(fields[0]=="  ");
1687     REGEX_ASSERT(fields[1]=="a");
1688     REGEX_ASSERT(fields[2]=="Now is ");
1689     REGEX_ASSERT(fields[3]=="b");
1690     REGEX_ASSERT(fields[4]=="the time");
1691     REGEX_ASSERT(fields[5]=="foo");
1692
1693     status = U_ZERO_ERROR;
1694     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1695     REGEX_CHECK_STATUS;
1696     REGEX_ASSERT(n==4);
1697     REGEX_ASSERT(fields[0]=="  ");
1698     REGEX_ASSERT(fields[1]=="a");
1699     REGEX_ASSERT(fields[2]=="Now is ");
1700     REGEX_ASSERT(fields[3]=="the time<c>");
1701     status = U_ZERO_ERROR;
1702     delete pat1;
1703
1704     pat1 = RegexPattern::compile("([-,])",  pe, status);
1705     REGEX_CHECK_STATUS;
1706     n = pat1->split("1-10,20", fields, 10, status);
1707     REGEX_CHECK_STATUS;
1708     REGEX_ASSERT(n==5);
1709     REGEX_ASSERT(fields[0]=="1");
1710     REGEX_ASSERT(fields[1]=="-");
1711     REGEX_ASSERT(fields[2]=="10");
1712     REGEX_ASSERT(fields[3]==",");
1713     REGEX_ASSERT(fields[4]=="20");
1714     delete pat1;
1715
1716     // Test split of string with empty trailing fields
1717     pat1 = RegexPattern::compile(",", pe, status);
1718     REGEX_CHECK_STATUS;
1719     n = pat1->split("a,b,c,", fields, 10, status);
1720     REGEX_CHECK_STATUS;
1721     REGEX_ASSERT(n==4);
1722     REGEX_ASSERT(fields[0]=="a");
1723     REGEX_ASSERT(fields[1]=="b");
1724     REGEX_ASSERT(fields[2]=="c");
1725     REGEX_ASSERT(fields[3]=="");
1726
1727     n = pat1->split("a,,,", fields, 10, status);
1728     REGEX_CHECK_STATUS;
1729     REGEX_ASSERT(n==4);
1730     REGEX_ASSERT(fields[0]=="a");
1731     REGEX_ASSERT(fields[1]=="");
1732     REGEX_ASSERT(fields[2]=="");
1733     REGEX_ASSERT(fields[3]=="");
1734     delete pat1;
1735
1736     // Split Separator with zero length match.
1737     pat1 = RegexPattern::compile(":?", pe, status);
1738     REGEX_CHECK_STATUS;
1739     n = pat1->split("abc", fields, 10, status);
1740     REGEX_CHECK_STATUS;
1741     REGEX_ASSERT(n==5);
1742     REGEX_ASSERT(fields[0]=="");
1743     REGEX_ASSERT(fields[1]=="a");
1744     REGEX_ASSERT(fields[2]=="b");
1745     REGEX_ASSERT(fields[3]=="c");
1746     REGEX_ASSERT(fields[4]=="");
1747
1748     delete pat1;
1749
1750     //
1751     // RegexPattern::pattern()
1752     //
1753     pat1 = new RegexPattern();
1754     REGEX_ASSERT(pat1->pattern() == "");
1755     delete pat1;
1756
1757     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1758     REGEX_CHECK_STATUS;
1759     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1760     delete pat1;
1761
1762
1763     //
1764     // classID functions
1765     //
1766     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1767     REGEX_CHECK_STATUS;
1768     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1769     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1770     UnicodeString Hello("Hello, world.");
1771     RegexMatcher *m = pat1->matcher(Hello, status);
1772     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1773     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1774     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1775     delete m;
1776     delete pat1;
1777
1778 }
1779
1780 //---------------------------------------------------------------------------
1781 //
1782 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1783 //                       is present and working, but excluding functions
1784 //                       implementing replace operations.
     //
     //                       Patterns and inputs are mostly handed to the API as
     //                       UText objects opened over UTF-8 bytes. The text is
     //                       either converted from an invariant-character literal
     //                       by regextst_openUTF8FromInvariant() or spelled out as
     //                       hex byte values, presumably so that the bytes do not
     //                       depend on the compiler's charset (see the ASCII/EBCDIC
     //                       note at the top of this file).
1785 //
1786 //---------------------------------------------------------------------------
1787 void RegexTest::API_Match_UTF8() {
         // These three are used by the first test block only; each later block
         // declares its own (shadowing) status, plus flags/pe where it needs them.
1788     UParseError         pe;
1789     UErrorCode          status=U_ZERO_ERROR;
1790     int32_t             flags = 0;
1791
1792     //
1793     // Debug - move failing test cases into the disabled block below to run them early
         //         (when the #if is enabled, the return skips every test that follows).
1794     //
1795 #if 0
1796     {
1797     }
1798     return;
1799 #endif
1800
1801     //
1802     // Simple pattern compilation
1803     //
1804     {
1805         UText               re = UTEXT_INITIALIZER;
1806         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1807         REGEX_VERBOSE_TEXT(&re);
1808         RegexPattern        *pat2;
1809         pat2 = RegexPattern::compile(&re, flags, pe, status);
1810         REGEX_CHECK_STATUS;
1811
1812         UText input1 = UTEXT_INITIALIZER;
1813         UText input2 = UTEXT_INITIALIZER;
1814         UText empty  = UTEXT_INITIALIZER;
1815         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1816         REGEX_VERBOSE_TEXT(&input1);
1817         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1818         REGEX_VERBOSE_TEXT(&input2);
             // "empty" is a UText over zero UChars; it serves as the empty input below.
1819         utext_openUChars(&empty, NULL, 0, &status);
1820
             // Lengths of the two inputs, used below to probe positions at and past the end.
1821         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1822         int32_t input2Len = strlen("not abc");
1823
1824
1825         //
1826         // Matcher creation and reset.
1827         //
1828         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1829         REGEX_CHECK_STATUS;
1830         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1831         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1832         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1833         m1->reset(&input2);
1834         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1835         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1836         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1837         m1->reset(&input1);
1838         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1839         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
             // With the empty input there is nothing for "abc" to match.
1840         m1->reset(&empty);
1841         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1842         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1843
1844         //
1845         //  reset(pos, status)
             //     Positions 0 through the input length are accepted; a negative
             //     position or one past the length must set U_INDEX_OUTOFBOUNDS_ERROR.
1846         //
1847         m1->reset(&input1);
1848         m1->reset(4, status);
1849         REGEX_CHECK_STATUS;
1850         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
             // NOTE(review): lookingAt() is expected to succeed even after reset(4) -
             // presumably because it always starts at the beginning of the region
             // rather than at the reset position; confirm against the RegexMatcher docs.
1851         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1852
1853         m1->reset(-1, status);
1854         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1855         status = U_ZERO_ERROR;
1856
1857         m1->reset(0, status);
1858         REGEX_CHECK_STATUS;
1859         status = U_ZERO_ERROR;
1860
1861         m1->reset(input1Len-1, status);
1862         REGEX_CHECK_STATUS;
1863         status = U_ZERO_ERROR;
1864
1865         m1->reset(input1Len, status);
1866         REGEX_CHECK_STATUS;
1867         status = U_ZERO_ERROR;
1868
1869         m1->reset(input1Len+1, status);
1870         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1871         status = U_ZERO_ERROR;
1872
1873         //
1874         // match(pos, status)
             //     The input is "not abc", so "abc" begins at offset 4 and runs to the end.
1875         //
1876         m1->reset(&input2);
1877         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1878         m1->reset();
1879         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1880         m1->reset();
1881         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1882         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1883         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1885
1886         // Match() at end of string should fail, but should not
1887         //  be an error.
1888         status = U_ZERO_ERROR;
1889         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1890         REGEX_CHECK_STATUS;
1891
1892         // Match beyond end of string should fail with an error.
1893         status = U_ZERO_ERROR;
1894         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1895         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1896
1897         // Successful match at end of string.
1898         {
1899             status = U_ZERO_ERROR;
1900             RegexMatcher m("A?", 0, status);  // will match zero length string.
1901             REGEX_CHECK_STATUS;
1902             m.reset(&input1);
1903             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1904             REGEX_CHECK_STATUS;
1905             m.reset(&empty);
1906             REGEX_ASSERT(m.matches(0, status) == TRUE);
1907             REGEX_CHECK_STATUS;
1908         }
1909
1910
1911         //
1912         // lookingAt(pos, status)
             //     Same position checks as for matches(pos, status) above.
1913         //
1914         status = U_ZERO_ERROR;
1915         m1->reset(&input2);  // "not abc"
1916         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1917         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1918         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1919         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1920         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1921         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1922         status = U_ZERO_ERROR;
1923         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1924         REGEX_CHECK_STATUS;
1925         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1926         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1927
1928         delete m1;
1929         delete pat2;
1930
1931         utext_close(&re);
1932         utext_close(&input1);
1933         utext_close(&input2);
1934         utext_close(&empty);
1935     }
1936
1937
1938     //
1939     // Capture Group.
1940     //     RegexMatcher::start();
1941     //     RegexMatcher::end();
1942     //     RegexMatcher::groupCount();
1943     //
1944     {
1945         int32_t             flags=0;
1946         UParseError         pe;
1947         UErrorCode          status=U_ZERO_ERROR;
1948         UText               re=UTEXT_INITIALIZER;
1949         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1950         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1951
1952         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1953         REGEX_CHECK_STATUS;
1954
1955         UText input = UTEXT_INITIALIZER;
1956         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1957         utext_openUTF8(&input, str_0123456789, -1, &status);
1958
1959         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1960         REGEX_CHECK_STATUS;
1961         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
             // Expected start/end offsets of groups 0..3 when the pattern
             // 01(23(45)67)(.*) is matched against "0123456789".
1962         static const int32_t matchStarts[] = {0,  2, 4, 8};
1963         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1964         int32_t i;
1965         for (i=0; i<4; i++) {
1966             int32_t actualStart = matcher->start(i, status);
1967             REGEX_CHECK_STATUS;
1968             if (actualStart != matchStarts[i]) {
1969                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1970                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
1971             }
1972             int32_t actualEnd = matcher->end(i, status);
1973             REGEX_CHECK_STATUS;
1974             if (actualEnd != matchEnds[i]) {
1975                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1976                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1977             }
1978         }
1979
             // The no-index overloads must agree with group 0.
1980         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1981         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1982
             // Group numbers outside 0..3 are out of bounds; after reset() there is
             // no current match, so start() must report an invalid state.
1983         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1984         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1985         matcher->reset();
1986         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1987
             // Re-establish a match so that the group() calls below have one to report on.
1988         matcher->lookingAt(status);
1989
1990         UnicodeString dest;
1991         UText destText = UTEXT_INITIALIZER;
1992         utext_openUnicodeString(&destText, &dest, &status);
1993         UText *result;
1994         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1995         //  Test shallow-clone API
             //  Each group() form is called twice: once with a NULL destination (the
             //  returned UText is closed by the test) and once with &destText, in which
             //  case the returned pointer must be &destText itself.
1996         int64_t   group_len;
1997         result = matcher->group((UText *)NULL, group_len, status);
1998         REGEX_CHECK_STATUS;
1999         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2000         utext_close(result);
2001         result = matcher->group(0, &destText, group_len, status);
2002         REGEX_CHECK_STATUS;
2003         REGEX_ASSERT(result == &destText);
2004         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2005         //  destText is now immutable, reopen it
2006         utext_close(&destText);
2007         utext_openUnicodeString(&destText, &dest, &status);
2008
2009         int64_t length;
2010         result = matcher->group(0, NULL, length, status);
2011         REGEX_CHECK_STATUS;
2012         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2013         utext_close(result);
2014         result = matcher->group(0, &destText, length, status);
2015         REGEX_CHECK_STATUS;
2016         REGEX_ASSERT(result == &destText);
2017         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2018         REGEX_ASSERT(length == 10);
2019         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2020
             // For the groups below, the returned UText is expected to hold the whole
             // input ("0123456789"); the group itself is identified by the UText's
             // native index (the group start) together with the returned length.
2021         // Capture Group 1 == "234567"
2022         result = matcher->group(1, NULL, length, status);
2023         REGEX_CHECK_STATUS;
2024         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2025         REGEX_ASSERT(length == 6);
2026         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2027         utext_close(result);
2028
2029         result = matcher->group(1, &destText, length, status);
2030         REGEX_CHECK_STATUS;
2031         REGEX_ASSERT(result == &destText);
2032         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2033         REGEX_ASSERT(length == 6);
2034         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
             // result is &destText here, so this closes destText; it is handed to
             // group() again below as the destination (same pattern for groups 2 and 3).
2035         utext_close(result);
2036
2037         // Capture Group 2 == "45"
2038         result = matcher->group(2, NULL, length, status);
2039         REGEX_CHECK_STATUS;
2040         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2041         REGEX_ASSERT(length == 2);
2042         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2043         utext_close(result);
2044
2045         result = matcher->group(2, &destText, length, status);
2046         REGEX_CHECK_STATUS;
2047         REGEX_ASSERT(result == &destText);
2048         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2049         REGEX_ASSERT(length == 2);
2050         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2051         utext_close(result);
2052
2053         // Capture Group 3 == "89"
2054         result = matcher->group(3, NULL, length, status);
2055         REGEX_CHECK_STATUS;
2056         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2057         REGEX_ASSERT(length == 2);
2058         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2059         utext_close(result);
2060
2061         result = matcher->group(3, &destText, length, status);
2062         REGEX_CHECK_STATUS;
2063         REGEX_ASSERT(result == &destText);
2064         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2065         REGEX_ASSERT(length == 2);
2066         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2067         utext_close(result);
2068
2069         // Capture Group number out of range.
2070         status = U_ZERO_ERROR;
2071         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2072         status = U_ZERO_ERROR;
2073         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2074         status = U_ZERO_ERROR;
2075         matcher->reset();
2076         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2077
2078         delete matcher;
2079         delete pat;
2080
             // NOTE(review): destText was already closed through result above, so this
             // is a second close of the same UText - presumably a harmless no-op;
             // confirm against the utext_close() documentation.
2081         utext_close(&destText);
2082         utext_close(&input);
2083         utext_close(&re);
2084     }
2085
2086     //
2087     //  find
2088     //
2089     {
2090         int32_t             flags=0;
2091         UParseError         pe;
2092         UErrorCode          status=U_ZERO_ERROR;
2093         UText               re=UTEXT_INITIALIZER;
2094         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2095         utext_openUTF8(&re, str_abc, -1, &status);
2096
2097         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2098         REGEX_CHECK_STATUS;
2099         UText input = UTEXT_INITIALIZER;
2100         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2101         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2102         //                      012345678901234567
             // (offset ruler for the 17-character input: "abc" begins at offsets 1, 6 and 12)
2103
2104         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2105         REGEX_CHECK_STATUS;
2106         REGEX_ASSERT(matcher->find());
2107         REGEX_ASSERT(matcher->start(status) == 1);
2108         REGEX_ASSERT(matcher->find());
2109         REGEX_ASSERT(matcher->start(status) == 6);
2110         REGEX_ASSERT(matcher->find());
2111         REGEX_ASSERT(matcher->start(status) == 12);
2112         REGEX_ASSERT(matcher->find() == FALSE);
2113         REGEX_ASSERT(matcher->find() == FALSE);
2114
2115         matcher->reset();
2116         REGEX_ASSERT(matcher->find());
2117         REGEX_ASSERT(matcher->start(status) == 1);
2118
             // find(pos, status): the search starts at pos, independent of earlier matches.
2119         REGEX_ASSERT(matcher->find(0, status));
2120         REGEX_ASSERT(matcher->start(status) == 1);
2121         REGEX_ASSERT(matcher->find(1, status));
2122         REGEX_ASSERT(matcher->start(status) == 1);
2123         REGEX_ASSERT(matcher->find(2, status));
2124         REGEX_ASSERT(matcher->start(status) == 6);
2125         REGEX_ASSERT(matcher->find(12, status));
2126         REGEX_ASSERT(matcher->start(status) == 12);
2127         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2128         REGEX_ASSERT(matcher->find(16, status) == FALSE);
             // 17 is the end of the input: no match, but not an error either.
2129         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2130         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2131
2132         status = U_ZERO_ERROR;
2133         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134         status = U_ZERO_ERROR;
2135         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136
             // The pattern "abc" contains no capture groups.
2137         REGEX_ASSERT(matcher->groupCount() == 0);
2138
2139         delete matcher;
2140         delete pat;
2141
2142         utext_close(&input);
2143         utext_close(&re);
2144     }
2145
2146
2147     //
2148     //  find, with \G in pattern (true if at the end of a previous match).
2149     //
2150     {
2151         int32_t             flags=0;
2152         UParseError         pe;
2153         UErrorCode          status=U_ZERO_ERROR;
2154         UText               re=UTEXT_INITIALIZER;
2155         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2156         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2157
2158         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2159
2160         REGEX_CHECK_STATUS;
2161         UText input = UTEXT_INITIALIZER;
2162         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2163         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2164         //                      012345678901234567
2165
2166         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2167         REGEX_CHECK_STATUS;
             // First find: the plain (abc) alternative (group 2) matches at offset 1;
             // the \G alternative (group 1) does not participate, so its start is -1.
2168         REGEX_ASSERT(matcher->find());
2169         REGEX_ASSERT(matcher->start(status) == 0);
2170         REGEX_ASSERT(matcher->start(1, status) == -1);
2171         REGEX_ASSERT(matcher->start(2, status) == 1);
2172
             // Second find: it begins at offset 4, where the previous match ended,
             // so this time the \G alternative (group 1) is the one that matches.
2173         REGEX_ASSERT(matcher->find());
2174         REGEX_ASSERT(matcher->start(status) == 4);
2175         REGEX_ASSERT(matcher->start(1, status) == 4);
2176         REGEX_ASSERT(matcher->start(2, status) == -1);
2177         REGEX_CHECK_STATUS;
2178
2179         delete matcher;
2180         delete pat;
2181
2182         utext_close(&input);
2183         utext_close(&re);
2184     }
2185
2186     //
2187     //   find with zero length matches, match position should bump ahead
2188     //     to prevent loops.
2189     //
2190     {
2191         int32_t                 i;
2192         UErrorCode          status=U_ZERO_ERROR;
2193         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2194                                                       //   using an always-true look-ahead.
2195         REGEX_CHECK_STATUS;
2196         UText s = UTEXT_INITIALIZER;
2197         utext_openUTF8(&s, "    ", -1, &status);
2198         m.reset(&s);
             // Four spaces: expect one empty match at each of the positions 0..4,
             // which leaves i at 5 when find() finally fails.
2199         for (i=0; ; i++) {
2200             if (m.find() == FALSE) {
2201                 break;
2202             }
2203             REGEX_ASSERT(m.start(status) == i);
2204             REGEX_ASSERT(m.end(status) == i);
2205         }
2206         REGEX_ASSERT(i==5);
2207
2208         // Check that the bump goes over characters outside the BMP OK
2209         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2210         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
             // Re-open s over the new buffer (the previous open of s is not closed first).
2211         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2212         m.reset(&s);
             // Each of the four characters occupies four UTF-8 bytes, so the expected
             // match positions are 0, 4, 8, 12 and 16; i has advanced to 20 by the
             // time find() fails.
2213         for (i=0; ; i+=4) {
2214             if (m.find() == FALSE) {
2215                 break;
2216             }
2217             REGEX_ASSERT(m.start(status) == i);
2218             REGEX_ASSERT(m.end(status) == i);
2219         }
2220         REGEX_ASSERT(i==20);
2221
2222         utext_close(&s);
2223     }
2224     {
2225         // find() loop breaking test.
2226         //        with pattern of /.?/, should see a series of one char matches, then a single
2227         //        match of zero length at the end of the input string.
2228         int32_t                 i;
2229         UErrorCode          status=U_ZERO_ERROR;
2230         RegexMatcher        m(".?", 0, status);
2231         REGEX_CHECK_STATUS;
2232         UText s = UTEXT_INITIALIZER;
2233         utext_openUTF8(&s, "    ", -1, &status);
2234         m.reset(&s);
2235         for (i=0; ; i++) {
2236             if (m.find() == FALSE) {
2237                 break;
2238             }
2239             REGEX_ASSERT(m.start(status) == i);
                 // Matches 0..3 each consume one space; the fifth (i==4) is the empty match at the end.
2240             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2241         }
2242         REGEX_ASSERT(i==5);
2243
2244         utext_close(&s);
2245     }
2246
2247
2248     //
2249     // Matchers with no input string behave as if they had an empty input string.
2250     //
2251
2252     {
2253         UErrorCode status = U_ZERO_ERROR;
2254         RegexMatcher  m(".?", 0, status);
2255         REGEX_CHECK_STATUS;
             // ".?" can match the empty string, so find() succeeds at offset 0.
2256         REGEX_ASSERT(m.find());
2257         REGEX_ASSERT(m.start(status) == 0);
2258         REGEX_ASSERT(m.input() == "");
2259     }
2260     {
2261         UErrorCode status = U_ZERO_ERROR;
2262         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2263         RegexMatcher  *m = p->matcher(status);
2264         REGEX_CHECK_STATUS;
2265
             // "." needs one character, so nothing can be found in the empty input.
2266         REGEX_ASSERT(m->find() == FALSE);
2267         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2268         delete m;
2269         delete p;
2270     }
2271
2272     //
2273     // Regions
2274     //
2275     {
2276         UErrorCode status = U_ZERO_ERROR;
2277         UText testPattern = UTEXT_INITIALIZER;
2278         UText testText    = UTEXT_INITIALIZER;
2279         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2280         REGEX_VERBOSE_TEXT(&testPattern);
2281         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2282         REGEX_VERBOSE_TEXT(&testText);
2283
2284         RegexMatcher m(&testPattern, &testText, 0, status);
2285         REGEX_CHECK_STATUS;
             // Expected defaults: the region spans the whole input, bounds are
             // anchoring and not transparent.
2286         REGEX_ASSERT(m.regionStart() == 0);
2287         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2288         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2289         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2290
             // With the region restricted to 2..4, ".*" must match exactly that range.
2291         m.region(2,4, status);
2292         REGEX_CHECK_STATUS;
2293         REGEX_ASSERT(m.matches(status));
2294         REGEX_ASSERT(m.start(status)==2);
2295         REGEX_ASSERT(m.end(status)==4);
2296         REGEX_CHECK_STATUS;
2297
             // reset() is expected to restore the region to the whole input.
2298         m.reset();
2299         REGEX_ASSERT(m.regionStart() == 0);
2300         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2301
             // Re-open testText over a shorter string; reset(&testText) must pick up its length.
2302         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2303         REGEX_VERBOSE_TEXT(&testText);
2304         m.reset(&testText);
2305         REGEX_ASSERT(m.regionStart() == 0);
2306         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2307
             // The bounds setters must return the matcher itself, and each setting
             // must survive a reset().
2308         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2309         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2310         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2311         REGEX_ASSERT(&m == &m.reset());
2312         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2313
2314         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2315         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2316         REGEX_ASSERT(&m == &m.reset());
2317         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2318
2319         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2320         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2321         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2322         REGEX_ASSERT(&m == &m.reset());
2323         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2324
2325         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2326         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2327         REGEX_ASSERT(&m == &m.reset());
2328         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2329
2330         utext_close(&testText);
2331         utext_close(&testPattern);
2332     }
2333
2334     //
2335     // hitEnd() and requireEnd()
         //     Three patterns are run against the same input, "aabb".
2336     //
2337     {
2338         UErrorCode status = U_ZERO_ERROR;
2339         UText testPattern = UTEXT_INITIALIZER;
2340         UText testText    = UTEXT_INITIALIZER;
2341         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2342         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2343         utext_openUTF8(&testPattern, str_, -1, &status);
2344         utext_openUTF8(&testText, str_aabb, -1, &status);
2345
             // ".*" : expected to hit the end of the input, without requiring it.
2346         RegexMatcher m1(&testPattern, &testText,  0, status);
2347         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2348         REGEX_ASSERT(m1.hitEnd() == TRUE);
2349         REGEX_ASSERT(m1.requireEnd() == FALSE);
2350         REGEX_CHECK_STATUS;
2351
             // "a*" : expected not to hit the end of "aabb".
             // testPattern is re-opened over the new pattern without being closed first.
2352         status = U_ZERO_ERROR;
2353         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2354         utext_openUTF8(&testPattern, str_a, -1, &status);
2355         RegexMatcher m2(&testPattern, &testText, 0, status);
2356         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2357         REGEX_ASSERT(m2.hitEnd() == FALSE);
2358         REGEX_ASSERT(m2.requireEnd() == FALSE);
2359         REGEX_CHECK_STATUS;
2360
             // ".*$" : expected both to hit the end and to require it.
2361         status = U_ZERO_ERROR;
2362         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2363         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2364         RegexMatcher m3(&testPattern, &testText, 0, status);
2365         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2366         REGEX_ASSERT(m3.hitEnd() == TRUE);
2367         REGEX_ASSERT(m3.requireEnd() == TRUE);
2368         REGEX_CHECK_STATUS;
2369
2370         utext_close(&testText);
2371         utext_close(&testPattern);
2372     }
2373 }
2374
2375
2376 //---------------------------------------------------------------------------
2377 //
2378 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2379 //                         Replace family of functions (replaceFirst, replaceAll,
2380 //                         appendReplacement, appendTail) with UTF-8 UText arguments.
2381 //---------------------------------------------------------------------------
2382 void RegexTest::API_Replace_UTF8() {
2383     //
2384     //  Replace: compile the pattern "abc" and attach a matcher to ".abc..abc...abc..".
2385     //
2386     int32_t             flags=0;
2387     UParseError         pe;
2388     UErrorCode          status=U_ZERO_ERROR;
2389
2390     UText               re=UTEXT_INITIALIZER;
2391     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2392     REGEX_VERBOSE_TEXT(&re);
2393     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2394     REGEX_CHECK_STATUS;
2395
2396     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2397     //             012345678901234567
2398     UText dataText = UTEXT_INITIALIZER;
2399     utext_openUTF8(&dataText, data, -1, &status);
2400     REGEX_CHECK_STATUS;
2401     REGEX_VERBOSE_TEXT(&dataText);
2402     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);   // deleted at the end of this function
2403
2404     //
2405     //  Plain vanilla matches.
2406     //    Each case runs with dest == NULL (the returned UText is closed here) and with dest == &destText.
2407     UnicodeString  dest;
2408     UText destText = UTEXT_INITIALIZER;
2409     utext_openUnicodeString(&destText, &dest, &status);
2410     UText *result;
2411
2412     UText replText = UTEXT_INITIALIZER;
2413
2414     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2415     utext_openUTF8(&replText, str_yz, -1, &status);
2416     REGEX_VERBOSE_TEXT(&replText);
2417     result = matcher->replaceFirst(&replText, NULL, status);
2418     REGEX_CHECK_STATUS;
2419     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2420     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2421     utext_close(result);
2422     result = matcher->replaceFirst(&replText, &destText, status);
2423     REGEX_CHECK_STATUS;
2424     REGEX_ASSERT(result == &destText);
2425     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2426
2427     result = matcher->replaceAll(&replText, NULL, status);
2428     REGEX_CHECK_STATUS;
2429     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2430     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2431     utext_close(result);
2432
2433     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);   // empty destText before reusing it as the destination
2434     result = matcher->replaceAll(&replText, &destText, status);
2435     REGEX_CHECK_STATUS;
2436     REGEX_ASSERT(result == &destText);
2437     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2438
2439     //
2440     //  Plain vanilla non-matches.
2441     //    The expected result of every call in this section is the unchanged input string.
2442     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2443     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2444     matcher->reset(&dataText);
2445
2446     result = matcher->replaceFirst(&replText, NULL, status);
2447     REGEX_CHECK_STATUS;
2448     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2449     utext_close(result);
2450     result = matcher->replaceFirst(&replText, &destText, status);
2451     REGEX_CHECK_STATUS;
2452     REGEX_ASSERT(result == &destText);
2453     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2454
2455     result = matcher->replaceAll(&replText, NULL, status);
2456     REGEX_CHECK_STATUS;
2457     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2458     utext_close(result);
2459     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);   // empty destText again
2460     result = matcher->replaceAll(&replText, &destText, status);
2461     REGEX_CHECK_STATUS;
2462     REGEX_ASSERT(result == &destText);
2463     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2464
2465     //
2466     // Empty source string
2467     //
2468     utext_openUTF8(&dataText, NULL, 0, &status);   // NULL text with length 0: empty input
2469     matcher->reset(&dataText);
2470
2471     result = matcher->replaceFirst(&replText, NULL, status);
2472     REGEX_CHECK_STATUS;
2473     REGEX_ASSERT_UTEXT_UTF8("", result);
2474     utext_close(result);
2475     result = matcher->replaceFirst(&replText, &destText, status);
2476     REGEX_CHECK_STATUS;
2477     REGEX_ASSERT(result == &destText);
2478     REGEX_ASSERT_UTEXT_UTF8("", result);
2479
2480     result = matcher->replaceAll(&replText, NULL, status);
2481     REGEX_CHECK_STATUS;
2482     REGEX_ASSERT_UTEXT_UTF8("", result);
2483     utext_close(result);
2484     result = matcher->replaceAll(&replText, &destText, status);
2485     REGEX_CHECK_STATUS;
2486     REGEX_ASSERT(result == &destText);
2487     REGEX_ASSERT_UTEXT_UTF8("", result);
2488
2489     //
2490     // Empty substitution string
2491     //
2492     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2493     matcher->reset(&dataText);
2494
2495     utext_openUTF8(&replText, NULL, 0, &status);   // NULL text with length 0: empty replacement
2496     result = matcher->replaceFirst(&replText, NULL, status);
2497     REGEX_CHECK_STATUS;
2498     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2499     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2500     utext_close(result);
2501     result = matcher->replaceFirst(&replText, &destText, status);
2502     REGEX_CHECK_STATUS;
2503     REGEX_ASSERT(result == &destText);
2504     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2505
2506     result = matcher->replaceAll(&replText, NULL, status);
2507     REGEX_CHECK_STATUS;
2508     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2509     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2510     utext_close(result);
2511     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2512     result = matcher->replaceAll(&replText, &destText, status);
2513     REGEX_CHECK_STATUS;
2514     REGEX_ASSERT(result == &destText);
2515     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2516
2517     //
2518     // match whole string
2519     //   Input "abc" is matched in full, so each result is exactly the replacement "xyz".
2520     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2521     utext_openUTF8(&dataText, str_abc, -1, &status);
2522     matcher->reset(&dataText);
2523
2524     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2525     utext_openUTF8(&replText, str_xyz, -1, &status);
2526     result = matcher->replaceFirst(&replText, NULL, status);
2527     REGEX_CHECK_STATUS;
2528     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2529     utext_close(result);
2530     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2531     result = matcher->replaceFirst(&replText, &destText, status);
2532     REGEX_CHECK_STATUS;
2533     REGEX_ASSERT(result == &destText);
2534     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2535
2536     result = matcher->replaceAll(&replText, NULL, status);
2537     REGEX_CHECK_STATUS;
2538     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2539     utext_close(result);
2540     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2541     result = matcher->replaceAll(&replText, &destText, status);
2542     REGEX_CHECK_STATUS;
2543     REGEX_ASSERT(result == &destText);
2544     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2545
2546     //
2547     // Capture Group, simple case
2548     //   A second pattern, "a(..)", gets its own matcher (matcher2) over "abcdefg".
2549     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2550     utext_openUTF8(&re, str_add, -1, &status);
2551     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2552     REGEX_CHECK_STATUS;
2553
2554     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2555     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2556     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2557     REGEX_CHECK_STATUS;
2558
2559     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2560     utext_openUTF8(&replText, str_11, -1, &status);
2561     result = matcher2->replaceFirst(&replText, NULL, status);
2562     REGEX_CHECK_STATUS;
2563     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2564     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2565     utext_close(result);
2566     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2567     result = matcher2->replaceFirst(&replText, &destText, status);
2568     REGEX_CHECK_STATUS;
2569     REGEX_ASSERT(result == &destText);
2570     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2571
2572     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2573     utext_openUTF8(&replText, str_v, -1, &status);
2574     REGEX_VERBOSE_TEXT(&replText);
2575     result = matcher2->replaceFirst(&replText, NULL, status);
2576     REGEX_CHECK_STATUS;
2577     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2578     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2579     utext_close(result);
2580     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2581     result = matcher2->replaceFirst(&replText, &destText, status);
2582     REGEX_CHECK_STATUS;
2583     REGEX_ASSERT(result == &destText);
2584     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2585
2586     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2587                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2588                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2589     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2590     result = matcher2->replaceFirst(&replText, NULL, status);
2591     REGEX_CHECK_STATUS;
2592     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2593     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2594     utext_close(result);
2595     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2596     result = matcher2->replaceFirst(&replText, &destText, status);
2597     REGEX_CHECK_STATUS;
2598     REGEX_ASSERT(result == &destText);
2599     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2600
2601     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2602     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2603     //                                 012345678901234567890123456
2604     supplDigitChars[22] = 0xF0;   // bytes 22..25 overwrite the "xxxx" placeholder with the UTF-8 form of U+1D7CF
2605     supplDigitChars[23] = 0x9D;
2606     supplDigitChars[24] = 0x9F;
2607     supplDigitChars[25] = 0x8F;
2608     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2609
2610     result = matcher2->replaceFirst(&replText, NULL, status);
2611     REGEX_CHECK_STATUS;
2612     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2613     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2614     utext_close(result);
2615     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2616     result = matcher2->replaceFirst(&replText, &destText, status);
2617     REGEX_CHECK_STATUS;
2618     REGEX_ASSERT(result == &destText);
2619     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2620     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2621     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2622     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);   // "a(..)" has no group 5
2623 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2624     utext_close(result);
2625     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2626     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2627     REGEX_ASSERT(result == &destText);
2628 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2629
2630     //
2631     // Replacement String with \u hex escapes
2632     //   (uses the first matcher again, i.e. the pattern "abc")
2633     {
2634       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2635       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2636         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2637         utext_openUTF8(&replText, str_u0043, -1, &status);
2638         matcher->reset(&dataText);
2639
2640         result = matcher->replaceAll(&replText, NULL, status);
2641         REGEX_CHECK_STATUS;
2642         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2643         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2644         utext_close(result);
2645         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2646         result = matcher->replaceAll(&replText, &destText, status);
2647         REGEX_CHECK_STATUS;
2648         REGEX_ASSERT(result == &destText);
2649         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2650     }
2651     {
2652       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2653         utext_openUTF8(&dataText, str_abc, -1, &status);
2654         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2655         utext_openUTF8(&replText, str_U00010000, -1, &status);
2656         matcher->reset(&dataText);
2657
2658         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2659         //                          0123456789
2660         expected[2] = 0xF0;   // bytes 2..5 overwrite the "xxxx" placeholder with the UTF-8 form of U+10000
2661         expected[3] = 0x90;
2662         expected[4] = 0x80;
2663         expected[5] = 0x80;
2664
2665         result = matcher->replaceAll(&replText, NULL, status);
2666         REGEX_CHECK_STATUS;
2667         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2668         utext_close(result);
2669         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2670         result = matcher->replaceAll(&replText, &destText, status);
2671         REGEX_CHECK_STATUS;
2672         REGEX_ASSERT(result == &destText);
2673         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2674     }
2675     // TODO:  need more thorough testing of capture substitutions.
2676
2677     // Bug 4057
2678     //   appendReplacement()/appendTail() after several find() calls, on a separate local matcher.
2679     {
2680         status = U_ZERO_ERROR;
2681 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2682 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2683 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2684         utext_openUTF8(&re, str_ssee, -1, &status);
2685         utext_openUTF8(&dataText, str_blah, -1, &status);
2686         utext_openUTF8(&replText, str_ooh, -1, &status);
2687
2688         RegexMatcher m(&re, 0, status);
2689         REGEX_CHECK_STATUS;
2690
2691         UnicodeString result;
2692         UText resultText = UTEXT_INITIALIZER;
2693         utext_openUnicodeString(&resultText, &result, &status);
2694
2695         // Multiple finds do NOT bump up the previous appendReplacement position.
2696         m.reset(&dataText);
2697         m.find();
2698         m.find();
2699         m.appendReplacement(&resultText, &replText, status);
2700         REGEX_CHECK_STATUS;
2701         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2702         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2703
2704         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2705         status = U_ZERO_ERROR;
2706         result.truncate(0);
2707         utext_openUnicodeString(&resultText, &result, &status);
2708         m.reset(10, status);
2709         m.find();
2710         m.find();
2711         m.appendReplacement(&resultText, &replText, status);
2712         REGEX_CHECK_STATUS;
2713         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2714         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2715
2716         // find() at interior of string, appendReplacement still starts at beginning.
2717         status = U_ZERO_ERROR;
2718         result.truncate(0);
2719         utext_openUnicodeString(&resultText, &result, &status);
2720         m.reset();
2721         m.find(10, status);
2722         m.find();
2723         m.appendReplacement(&resultText, &replText, status);
2724         REGEX_CHECK_STATUS;
2725         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2726         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2727
2728         m.appendTail(&resultText, status);   // appends the rest of the input (" fin") after the last match
2729         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2730         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2731
2732         utext_close(&resultText);
2733     }
2734
2735     delete matcher2;   // release the matchers and patterns created above
2736     delete pat2;
2737     delete matcher;
2738     delete pat;
2739
2740     utext_close(&dataText);   // close the stack-allocated UTexts reused throughout the test
2741     utext_close(&replText);
2742     utext_close(&destText);
2743     utext_close(&re);
2744 }
2745
2746
2747 //---------------------------------------------------------------------------
2748 //
2749 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2750 //                        present and nominally working.
2751 //                        Patterns are supplied to compile() as UTF-8 UText.
2752 //---------------------------------------------------------------------------
2753 void RegexTest::API_Pattern_UTF8() {
2754     RegexPattern        pata;    // Test default constructor to not crash.
2755     RegexPattern        patb;
2756
2757     REGEX_ASSERT(pata == patb);
2758     REGEX_ASSERT(pata == pata);
2759
2760     UText         re1 = UTEXT_INITIALIZER;
2761     UText         re2 = UTEXT_INITIALIZER;
2762     UErrorCode    status = U_ZERO_ERROR;
2763     UParseError   pe;
2764
2765     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2766     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2767     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2768     utext_openUTF8(&re2, str_def, -1, &status);
2769
2770     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2771     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2772     REGEX_CHECK_STATUS;
2773     REGEX_ASSERT(*pat1 == *pat1);
2774     REGEX_ASSERT(*pat1 != pata);
2775
2776     // Assign
2777     patb = *pat1;
2778     REGEX_ASSERT(patb == *pat1);
2779
2780     // Copy Construct
2781     RegexPattern patc(*pat1);
2782     REGEX_ASSERT(patc == *pat1);
2783     REGEX_ASSERT(patb == patc);
2784     REGEX_ASSERT(*pat1 != *pat2);   // compare the patterns themselves, not the two pointers
2785     patb = *pat2;
2786     REGEX_ASSERT(patb != patc);
2787     REGEX_ASSERT(patb == *pat2);
2788
2789     // Compile with no flags.
2790     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2791     REGEX_CHECK_STATUS;   // do not dereference pat1a if the compile failed
2792     REGEX_ASSERT(*pat1a == *pat1);
2793     REGEX_ASSERT(pat1a->flags() == 0);
2794
2795     // Compile with different flags should be not equal
2796     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2797     REGEX_CHECK_STATUS;
2798
2799     REGEX_ASSERT(*pat1b != *pat1a);
2800     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2801     REGEX_ASSERT(pat1a->flags() == 0);
2802     delete pat1b;
2803
2804     // clone
2805     RegexPattern *pat1c = pat1->clone();
2806     REGEX_ASSERT(*pat1c == *pat1);
2807     REGEX_ASSERT(*pat1c != *pat2);
2808
2809     delete pat1c;
2810     delete pat1a;
2811     delete pat1;
2812     delete pat2;
2813
2814     utext_close(&re1);
2815     utext_close(&re2);
2816
2817
2818     //
2819     //   Verify that a matcher created from a cloned pattern works.
2820     //     (Jitterbug 3423)
2821     //     The source pattern is deleted before the clone is used.
2822     {
2823         UErrorCode     status     = U_ZERO_ERROR;
2824         UText          pattern    = UTEXT_INITIALIZER;
2825         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2826         utext_openUTF8(&pattern, str_pL, -1, &status);
2827
2828         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2829         RegexPattern  *pClone     = pSource->clone();
2830         delete         pSource;
2831         RegexMatcher  *mFromClone = pClone->matcher(status);
2832         REGEX_CHECK_STATUS;
2833
2834         UText          input      = UTEXT_INITIALIZER;
2835         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2836         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2837         mFromClone->reset(&input);
2838         REGEX_ASSERT(mFromClone->find() == TRUE);
2839         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2840         REGEX_ASSERT(mFromClone->find() == TRUE);
2841         REGEX_ASSERT(mFromClone->group(status) == "World");
2842         REGEX_ASSERT(mFromClone->find() == FALSE);
2843         delete mFromClone;
2844         delete pClone;
2845
2846         utext_close(&input);
2847         utext_close(&pattern);
2848     }
2849
2850     //
2851     //   matches convenience API
2852     //     Every case passes the pattern/input UTexts opened here, so the UText overload is the one exercised.
2853     {
2854         UErrorCode status  = U_ZERO_ERROR;
2855         UText      pattern = UTEXT_INITIALIZER;
2856         UText      input   = UTEXT_INITIALIZER;
2857
2858         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2859         utext_openUTF8(&input, str_randominput, -1, &status);
2860
2861         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2862         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2863         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2864         REGEX_CHECK_STATUS;
2865
2866         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2867         utext_openUTF8(&pattern, str_abc, -1, &status);
2868         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2869         REGEX_CHECK_STATUS;
2870
2871         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2872         utext_openUTF8(&pattern, str_nput, -1, &status);
2873         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2874         REGEX_CHECK_STATUS;
2875
2876         utext_openUTF8(&pattern, str_randominput, -1, &status);
2877         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2878         REGEX_CHECK_STATUS;
2879
2880         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2881         utext_openUTF8(&pattern, str_u, -1, &status);
2882         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2883         REGEX_CHECK_STATUS;
2884
2885         utext_openUTF8(&input, str_abc, -1, &status);
2886         utext_openUTF8(&pattern, str_abc, -1, &status);
2887         status = U_INDEX_OUTOFBOUNDS_ERROR;   // a failure status on entry: matches() must return FALSE and leave it untouched
2888         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == FALSE);
2889         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2890
2891         utext_close(&input);
2892         utext_close(&pattern);
2893     }
2894
2895
2896     //
2897     // Split()
2898     //   The fields[] array is reused by every call; entries beyond the returned count are checked for leftovers.
2899     status = U_ZERO_ERROR;
2900     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2901     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2902     pat1 = RegexPattern::compile(&re1, pe, status);
2903     REGEX_CHECK_STATUS;
2904     UnicodeString  fields[10];
2905
2906     int32_t n;
2907     n = pat1->split("Now is the time", fields, 10, status);
2908     REGEX_CHECK_STATUS;
2909     REGEX_ASSERT(n==4);
2910     REGEX_ASSERT(fields[0]=="Now");
2911     REGEX_ASSERT(fields[1]=="is");
2912     REGEX_ASSERT(fields[2]=="the");
2913     REGEX_ASSERT(fields[3]=="time");
2914     REGEX_ASSERT(fields[4]=="");
2915
2916     n = pat1->split("Now is the time", fields, 2, status);
2917     REGEX_CHECK_STATUS;
2918     REGEX_ASSERT(n==2);
2919     REGEX_ASSERT(fields[0]=="Now");
2920     REGEX_ASSERT(fields[1]=="is the time");
2921     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2922
2923     fields[1] = "*";
2924     status = U_ZERO_ERROR;
2925     n = pat1->split("Now is the time", fields, 1, status);
2926     REGEX_CHECK_STATUS;
2927     REGEX_ASSERT(n==1);
2928     REGEX_ASSERT(fields[0]=="Now is the time");
2929     REGEX_ASSERT(fields[1]=="*");
2930     status = U_ZERO_ERROR;
2931
2932     n = pat1->split("    Now       is the time   ", fields, 10, status);
2933     REGEX_CHECK_STATUS;
2934     REGEX_ASSERT(n==6);
2935     REGEX_ASSERT(fields[0]=="");
2936     REGEX_ASSERT(fields[1]=="Now");
2937     REGEX_ASSERT(fields[2]=="is");
2938     REGEX_ASSERT(fields[3]=="the");
2939     REGEX_ASSERT(fields[4]=="time");
2940     REGEX_ASSERT(fields[5]=="");
2941     REGEX_ASSERT(fields[6]=="");
2942
2943     fields[2] = "*";
2944     n = pat1->split("     ", fields, 10, status);
2945     REGEX_CHECK_STATUS;
2946     REGEX_ASSERT(n==2);
2947     REGEX_ASSERT(fields[0]=="");
2948     REGEX_ASSERT(fields[1]=="");
2949     REGEX_ASSERT(fields[2]=="*");
2950
2951     fields[0] = "foo";
2952     n = pat1->split("", fields, 10, status);
2953     REGEX_CHECK_STATUS;
2954     REGEX_ASSERT(n==0);
2955     REGEX_ASSERT(fields[0]=="foo");
2956
2957     delete pat1;
2958
2959     //  split, with a pattern with (capture)
2960     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2961     pat1 = RegexPattern::compile(&re1,  pe, status);
2962     REGEX_CHECK_STATUS;
2963
2964     status = U_ZERO_ERROR;
2965     fields[6] = fields[7] = "*";
2966     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2967     REGEX_CHECK_STATUS;
2968     REGEX_ASSERT(n==7);
2969     REGEX_ASSERT(fields[0]=="");
2970     REGEX_ASSERT(fields[1]=="a");
2971     REGEX_ASSERT(fields[2]=="Now is ");
2972     REGEX_ASSERT(fields[3]=="b");
2973     REGEX_ASSERT(fields[4]=="the time");
2974     REGEX_ASSERT(fields[5]=="c");
2975     REGEX_ASSERT(fields[6]=="");
2976     REGEX_ASSERT(fields[7]=="*");
2977     REGEX_ASSERT(status==U_ZERO_ERROR);
2978
2979     fields[6] = fields[7] = "*";
2980     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2981     REGEX_CHECK_STATUS;
2982     REGEX_ASSERT(n==7);
2983     REGEX_ASSERT(fields[0]=="  ");
2984     REGEX_ASSERT(fields[1]=="a");
2985     REGEX_ASSERT(fields[2]=="Now is ");
2986     REGEX_ASSERT(fields[3]=="b");
2987     REGEX_ASSERT(fields[4]=="the time");
2988     REGEX_ASSERT(fields[5]=="c");
2989     REGEX_ASSERT(fields[6]=="");
2990     REGEX_ASSERT(fields[7]=="*");
2991
2992     status = U_ZERO_ERROR;
2993     fields[6] = "foo";
2994     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2995     REGEX_CHECK_STATUS;
2996     REGEX_ASSERT(n==6);
2997     REGEX_ASSERT(fields[0]=="  ");
2998     REGEX_ASSERT(fields[1]=="a");
2999     REGEX_ASSERT(fields[2]=="Now is ");
3000     REGEX_ASSERT(fields[3]=="b");
3001     REGEX_ASSERT(fields[4]=="the time");
3002     REGEX_ASSERT(fields[5]==" ");
3003     REGEX_ASSERT(fields[6]=="foo");
3004
3005     status = U_ZERO_ERROR;
3006     fields[5] = "foo";
3007     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3008     REGEX_CHECK_STATUS;
3009     REGEX_ASSERT(n==5);
3010     REGEX_ASSERT(fields[0]=="  ");
3011     REGEX_ASSERT(fields[1]=="a");
3012     REGEX_ASSERT(fields[2]=="Now is ");
3013     REGEX_ASSERT(fields[3]=="b");
3014     REGEX_ASSERT(fields[4]=="the time<c>");
3015     REGEX_ASSERT(fields[5]=="foo");
3016
3017     status = U_ZERO_ERROR;
3018     fields[5] = "foo";
3019     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3020     REGEX_CHECK_STATUS;
3021     REGEX_ASSERT(n==5);
3022     REGEX_ASSERT(fields[0]=="  ");
3023     REGEX_ASSERT(fields[1]=="a");
3024     REGEX_ASSERT(fields[2]=="Now is ");
3025     REGEX_ASSERT(fields[3]=="b");
3026     REGEX_ASSERT(fields[4]=="the time");
3027     REGEX_ASSERT(fields[5]=="foo");
3028
3029     status = U_ZERO_ERROR;
3030     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3031     REGEX_CHECK_STATUS;
3032     REGEX_ASSERT(n==4);
3033     REGEX_ASSERT(fields[0]=="  ");
3034     REGEX_ASSERT(fields[1]=="a");
3035     REGEX_ASSERT(fields[2]=="Now is ");
3036     REGEX_ASSERT(fields[3]=="the time<c>");
3037     status = U_ZERO_ERROR;
3038     delete pat1;
3039
3040     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3041     pat1 = RegexPattern::compile(&re1, pe, status);
3042     REGEX_CHECK_STATUS;
3043     n = pat1->split("1-10,20", fields, 10, status);
3044     REGEX_CHECK_STATUS;
3045     REGEX_ASSERT(n==5);
3046     REGEX_ASSERT(fields[0]=="1");
3047     REGEX_ASSERT(fields[1]=="-");
3048     REGEX_ASSERT(fields[2]=="10");
3049     REGEX_ASSERT(fields[3]==",");
3050     REGEX_ASSERT(fields[4]=="20");
3051     delete pat1;
3052
3053
3054     //
3055     // split of a UText based string, with library allocating output UTexts.
3056     //   splits[] starts out all NULL; every non-NULL entry is closed in the loop below.
3057     {
3058         status = U_ZERO_ERROR;
3059         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3060         UnicodeString stringToSplit("first:second:third");
3061         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3062         REGEX_CHECK_STATUS;
3063
3064         UText *splits[10] = {NULL};
3065         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3066         REGEX_CHECK_STATUS;
3067         REGEX_ASSERT(numFields == 5);
3068         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3069         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3070         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3071         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3072         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3073         REGEX_ASSERT(splits[5] == NULL);
3074
3075         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3076             if (splits[i]) {
3077                 utext_close(splits[i]);
3078                 splits[i] = NULL;
3079             }
3080         }
3081         utext_close(textToSplit);
3082     }
3083
3084
3085     //
3086     // RegexPattern::pattern() and patternText()
3087     //   Checked on a default-constructed pattern first, then on a compiled one.
3088     pat1 = new RegexPattern();
3089     REGEX_ASSERT(pat1->pattern() == "");
3090     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3091     delete pat1;
3092     const char *helloWorldInvariant = "(Hello, world)*";
3093     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3094     pat1 = RegexPattern::compile(&re1, pe, status);
3095     REGEX_CHECK_STATUS;
3096     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3097     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3098     delete pat1;
3099
3100     utext_close(&re1);
3101 }
3102
3103
3104 //---------------------------------------------------------------------------
3105 //
3106 //      Extended       A more thorough check for features of regex patterns
3107 //                     The test cases are in a separate data file,
3108 //                       source/test/testdata/regextst.txt
3109 //                     A description of the test data format is included in that file.
3110 //
3111 //---------------------------------------------------------------------------
3112
3113 const char *
3114 RegexTest::getPath(char buffer[2048], const char *filename) {
3115     UErrorCode status=U_ZERO_ERROR;
3116     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3117     if (U_FAILURE(status)) {
3118         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3119         return NULL;
3120     }
3121
3122     strcpy(buffer, testDataDirectory);
3123     strcat(buffer, filename);
3124     return buffer;
3125 }
3126
3127 void RegexTest::Extended() {
3128     char tdd[2048];
3129     const char *srcPath;
3130     UErrorCode  status  = U_ZERO_ERROR;
3131     int32_t     lineNum = 0;
3132
3133     //
3134     //  Open and read the test data file.
3135     //
3136     srcPath=getPath(tdd, "regextst.txt");
3137     if(srcPath==NULL) {
3138         return; /* something went wrong, error already output */
3139     }
3140
3141     int32_t    len;
3142     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3143     if (U_FAILURE(status)) {
3144         return; /* something went wrong, error already output */
3145     }
3146
3147     //
3148     //  Put the test data into a UnicodeString
3149     //
3150     UnicodeString testString(FALSE, testData, len);
3151
3152     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3153     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3154     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3155
3156     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3157     UnicodeString   testPattern;   // The pattern for test from the test file.
3158     UnicodeString   testFlags;     // the flags   for a test.
3159     UnicodeString   matchString;   // The marked up string to be used as input
3160
3161     if (U_FAILURE(status)){
3162         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3163         delete [] testData;
3164         return;
3165     }
3166
3167     //
3168     //  Loop over the test data file, once per line.
3169     //
3170     while (lineMat.find()) {
3171         lineNum++;
3172         if (U_FAILURE(status)) {
3173           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3174         }
3175
3176         status = U_ZERO_ERROR;
3177         UnicodeString testLine = lineMat.group(1, status);
3178         if (testLine.length() == 0) {
3179             continue;
3180         }
3181
3182         //
3183         // Parse the test line.  Skip blank and comment only lines.
3184         // Separate out the three main fields - pattern, flags, target.
3185         //
3186
3187         commentMat.reset(testLine);
3188         if (commentMat.lookingAt(status)) {
3189             // This line is a comment, or blank.
3190             continue;
3191         }
3192
3193         //
3194         //  Pull out the pattern field, remove it from the test file line.
3195         //
3196         quotedStuffMat.reset(testLine);
3197         if (quotedStuffMat.lookingAt(status)) {
3198             testPattern = quotedStuffMat.group(2, status);
3199             testLine.remove(0, quotedStuffMat.end(0, status));
3200         } else {
3201             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3202             continue;
3203         }
3204
3205
3206         //
3207         //  Pull out the flags from the test file line.
3208         //
3209         flagsMat.reset(testLine);
3210         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3211         testFlags = flagsMat.group(1, status);
3212         if (flagsMat.group(2, status).length() > 0) {
3213             errln("Bad Match flag at line %d. Scanning %c\n",
3214                 lineNum, flagsMat.group(2, status).charAt(0));
3215             continue;
3216         }
3217         testLine.remove(0, flagsMat.end(0, status));
3218
3219         //
3220         //  Pull out the match string, as a whole.
3221         //    We'll process the <tags> later.
3222         //
3223         quotedStuffMat.reset(testLine);
3224         if (quotedStuffMat.lookingAt(status)) {
3225             matchString = quotedStuffMat.group(2, status);
3226             testLine.remove(0, quotedStuffMat.end(0, status));
3227         } else {
3228             errln("Bad match string at test file line %d", lineNum);
3229             continue;
3230         }
3231
3232         //
3233         //  The only thing left from the input line should be an optional trailing comment.
3234         //
3235         commentMat.reset(testLine);
3236         if (commentMat.lookingAt(status) == FALSE) {
3237             errln("Line %d: unexpected characters at end of test line.", lineNum);
3238             continue;
3239         }
3240
3241         //
3242         //  Run the test
3243         //
3244         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3245     }
3246
3247     delete [] testData;
3248
3249 }
3250
3251
3252
3253 //---------------------------------------------------------------------------
3254 //
3255 //    regex_find(pattern, flags, inputString, lineNumber)
3256 //
3257 //         Function to run a single test from the Extended (data driven) tests.
3258 //         See file test/testdata/regextst.txt for a description of the
3259 //         pattern and inputString fields, and the allowed flags.
3260 //         lineNumber is the source line in regextst.txt of the test.
3261 //
3262 //---------------------------------------------------------------------------
3263
3264
3265 //  Set a value into a UVector at position specified by a decimal number in
3266 //   a UnicodeString.   This is a utility function needed by the actual test function,
3267 //   which follows.
3268 static void set(UVector &vec, int32_t val, UnicodeString index) {
3269     UErrorCode  status=U_ZERO_ERROR;
3270     int32_t  idx = 0;
3271     for (int32_t i=0; i<index.length(); i++) {
3272         int32_t d=u_charDigitValue(index.charAt(i));
3273         if (d<0) {return;}
3274         idx = idx*10 + d;
3275     }
3276     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3277     vec.setElementAt(val, idx);
3278 }
3279
3280 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3281     UErrorCode  status=U_ZERO_ERROR;
3282     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3283     vec.setElementAt(val, idx);
3284 }
3285
3286 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3287 {
3288     UBool couldFind = TRUE;
3289     UTEXT_SETNATIVEINDEX(utext, 0);
3290     int32_t i = 0;
3291     while (i < unistrOffset) {
3292         UChar32 c = UTEXT_NEXT32(utext);
3293         if (c != U_SENTINEL) {
3294             i += U16_LENGTH(c);
3295         } else {
3296             couldFind = FALSE;
3297             break;
3298         }
3299     }
3300     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3301     return couldFind;
3302 }
3303
3304
3305 void RegexTest::regex_find(const UnicodeString &pattern,
3306                            const UnicodeString &flags,
3307                            const UnicodeString &inputString,
3308                            const char *srcPath,
3309                            int32_t line) {
3310     UnicodeString       unEscapedInput;
3311     UnicodeString       deTaggedInput;
3312
3313     int32_t             patternUTF8Length,      inputUTF8Length;
3314     char                *patternChars  = NULL, *inputChars = NULL;
3315     UText               patternText    = UTEXT_INITIALIZER;
3316     UText               inputText      = UTEXT_INITIALIZER;
3317     UConverter          *UTF8Converter = NULL;
3318
3319     UErrorCode          status         = U_ZERO_ERROR;
3320     UParseError         pe;
3321     RegexPattern        *parsePat      = NULL;
3322     RegexMatcher        *parseMatcher  = NULL;
3323     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3324     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3325     UVector             groupStarts(status);
3326     UVector             groupEnds(status);
3327     UVector             groupStartsUTF8(status);
3328     UVector             groupEndsUTF8(status);
3329     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3330     UBool               failed         = FALSE;
3331     int32_t             numFinds;
3332     int32_t             i;
3333     UBool               useMatchesFunc   = FALSE;
3334     UBool               useLookingAtFunc = FALSE;
3335     int32_t             regionStart      = -1;
3336     int32_t             regionEnd        = -1;
3337     int32_t             regionStartUTF8  = -1;
3338     int32_t             regionEndUTF8    = -1;
3339
3340
3341     //
3342     //  Compile the caller's pattern
3343     //
3344     uint32_t bflags = 0;
3345     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3346         bflags |= UREGEX_CASE_INSENSITIVE;
3347     }
3348     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3349         bflags |= UREGEX_COMMENTS;
3350     }
3351     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3352         bflags |= UREGEX_DOTALL;
3353     }
3354     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3355         bflags |= UREGEX_MULTILINE;
3356     }
3357
3358     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3359         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3360     }
3361     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3362         bflags |= UREGEX_UNIX_LINES;
3363     }
3364     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3365         bflags |= UREGEX_LITERAL;
3366     }
3367
3368
3369     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3370     if (status != U_ZERO_ERROR) {
3371         #if UCONFIG_NO_BREAK_ITERATION==1
3372         // 'v' test flag means that the test pattern should not compile if ICU was configured
3373         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3374         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3375             goto cleanupAndReturn;
3376         }
3377         #endif
3378         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3379             // Expected pattern compilation error.
3380             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3381                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3382             }
3383             goto cleanupAndReturn;
3384         } else {
3385             // Unexpected pattern compilation error.
3386             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3387             goto cleanupAndReturn;
3388         }
3389     }
3390
3391     UTF8Converter = ucnv_open("UTF8", &status);
3392     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3393
3394     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3395     status = U_ZERO_ERROR; // buffer overflow
3396     patternChars = new char[patternUTF8Length+1];
3397     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3398     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3399
3400     if (status == U_ZERO_ERROR) {
3401         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3402
3403         if (status != U_ZERO_ERROR) {
3404 #if UCONFIG_NO_BREAK_ITERATION==1
3405             // 'v' test flag means that the test pattern should not compile if ICU was configured
3406             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3407             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3408                 goto cleanupAndReturn;
3409             }
3410 #endif
3411             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3412                 // Expected pattern compilation error.
3413                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3414                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3415                 }
3416                 goto cleanupAndReturn;
3417             } else {
3418                 // Unexpected pattern compilation error.
3419                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3420                 goto cleanupAndReturn;
3421             }
3422         }
3423     }
3424
3425     if (UTF8Pattern == NULL) {
3426         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3427         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3428         status = U_ZERO_ERROR;
3429     }
3430
3431     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3432         callerPattern->dumpPattern();
3433     }
3434
3435     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3436         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3437         goto cleanupAndReturn;
3438     }
3439
3440
3441     //
3442     // Number of times find() should be called on the test string, default to 1
3443     //
3444     numFinds = 1;
3445     for (i=2; i<=9; i++) {
3446         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3447             if (numFinds != 1) {
3448                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3449                 goto cleanupAndReturn;
3450             }
3451             numFinds = i;
3452         }
3453     }
3454
3455     // 'M' flag.  Use matches() instead of find()
3456     if (flags.indexOf((UChar)0x4d) >= 0) {
3457         useMatchesFunc = TRUE;
3458     }
3459     if (flags.indexOf((UChar)0x4c) >= 0) {
3460         useLookingAtFunc = TRUE;
3461     }
3462
3463     //
3464     //  Find the tags in the input data, remove them, and record the group boundary
3465     //    positions.
3466     //
3467     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3468     REGEX_CHECK_STATUS_L(line);
3469
3470     unEscapedInput = inputString.unescape();
3471     parseMatcher = parsePat->matcher(unEscapedInput, status);
3472     REGEX_CHECK_STATUS_L(line);
3473     while(parseMatcher->find()) {
3474         parseMatcher->appendReplacement(deTaggedInput, "", status);
3475         REGEX_CHECK_STATUS;
3476         UnicodeString groupNum = parseMatcher->group(2, status);
3477         if (groupNum == "r") {
3478             // <r> or </r>, a region specification within the string
3479             if (parseMatcher->group(1, status) == "/") {
3480                 regionEnd = deTaggedInput.length();
3481             } else {
3482                 regionStart = deTaggedInput.length();
3483             }
3484         } else {
3485             // <digits> or </digits>, a group match boundary tag.
3486             if (parseMatcher->group(1, status) == "/") {
3487                 set(groupEnds, deTaggedInput.length(), groupNum);
3488             } else {
3489                 set(groupStarts, deTaggedInput.length(), groupNum);
3490             }
3491         }
3492     }
3493     parseMatcher->appendTail(deTaggedInput);
3494     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3495     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3496       errln("mismatched <r> tags");
3497       failed = TRUE;
3498       goto cleanupAndReturn;
3499     }
3500
3501     //
3502     //  Configure the matcher according to the flags specified with this test.
3503     //
3504     matcher = callerPattern->matcher(deTaggedInput, status);
3505     REGEX_CHECK_STATUS_L(line);
3506     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3507         matcher->setTrace(TRUE);
3508     }
3509
3510     if (UTF8Pattern != NULL) {
3511         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3512         status = U_ZERO_ERROR; // buffer overflow
3513         inputChars = new char[inputUTF8Length+1];
3514         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3515         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3516
3517         if (status == U_ZERO_ERROR) {
3518             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3519             REGEX_CHECK_STATUS_L(line);
3520         }
3521
3522         if (UTF8Matcher == NULL) {
3523             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3524             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3525             status = U_ZERO_ERROR;
3526         }
3527     }
3528
3529     //
3530     //  Generate native indices for UTF8 versions of region and capture group info
3531     //
3532     if (UTF8Matcher != NULL) {
3533         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3534             UTF8Matcher->setTrace(TRUE);
3535         }
3536         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3537         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3538
3539         //  Fill out the native index UVector info.
3540         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3541         for (i=0; i<groupStarts.size(); i++) {
3542             int32_t  start = groupStarts.elementAti(i);
3543             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3544             if (start >= 0) {
3545                 int32_t  startUTF8;
3546                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3547                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3548                     failed = TRUE;
3549                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3550                 }
3551                 setInt(groupStartsUTF8, startUTF8, i);
3552             }
3553
3554             int32_t  end = groupEnds.elementAti(i);
3555             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3556             if (end >= 0) {
3557                 int32_t  endUTF8;
3558                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3559                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3560                     failed = TRUE;
3561                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3562                 }
3563                 setInt(groupEndsUTF8, endUTF8, i);
3564             }
3565         }
3566     }
3567
3568     if (regionStart>=0) {
3569        matcher->region(regionStart, regionEnd, status);
3570        REGEX_CHECK_STATUS_L(line);
3571        if (UTF8Matcher != NULL) {
3572            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3573            REGEX_CHECK_STATUS_L(line);
3574        }
3575     }
3576     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3577         matcher->useAnchoringBounds(FALSE);
3578         if (UTF8Matcher != NULL) {
3579             UTF8Matcher->useAnchoringBounds(FALSE);
3580         }
3581     }
3582     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3583         matcher->useTransparentBounds(TRUE);
3584         if (UTF8Matcher != NULL) {
3585             UTF8Matcher->useTransparentBounds(TRUE);
3586         }
3587     }
3588
3589
3590
3591     //
3592     // Do a find on the de-tagged input using the caller's pattern
3593     //     TODO: error on count>1 and not find().
3594     //           error on both matches() and lookingAt().
3595     //
3596     for (i=0; i<numFinds; i++) {
3597         if (useMatchesFunc) {
3598             isMatch = matcher->matches(status);
3599             if (UTF8Matcher != NULL) {
3600                isUTF8Match = UTF8Matcher->matches(status);
3601             }
3602         } else  if (useLookingAtFunc) {
3603             isMatch = matcher->lookingAt(status);
3604             if (UTF8Matcher != NULL) {
3605                 isUTF8Match = UTF8Matcher->lookingAt(status);
3606             }
3607         } else {
3608             isMatch = matcher->find();
3609             if (UTF8Matcher != NULL) {
3610                 isUTF8Match = UTF8Matcher->find();
3611             }
3612         }
3613     }
3614     matcher->setTrace(FALSE);
3615     if (UTF8Matcher) {
3616         UTF8Matcher->setTrace(FALSE);
3617     }
3618     if (U_FAILURE(status)) {
3619         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3620     }
3621
3622     //
3623     // Match up the groups from the find() with the groups from the tags
3624     //
3625
3626     // number of tags should match number of groups from find operation.
3627     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3628     //   G option in test means that capture group data is not available in the
3629     //     expected results, so the check needs to be suppressed.
3630     if (isMatch == FALSE && groupStarts.size() != 0) {
3631         dataerrln("Error at line %d:  Match expected, but none found.", line);
3632         failed = TRUE;
3633         goto cleanupAndReturn;
3634     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3635         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3636         failed = TRUE;
3637         goto cleanupAndReturn;
3638     }
3639     if (isMatch && groupStarts.size() == 0) {
3640         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3641         failed = TRUE;
3642     }
3643     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3644         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3645         failed = TRUE;
3646     }
3647
3648     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3649         // Only check for match / no match.  Don't check capture groups.
3650         goto cleanupAndReturn;
3651     }
3652
3653     REGEX_CHECK_STATUS_L(line);
3654     for (i=0; i<=matcher->groupCount(); i++) {
3655         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3656         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3657         if (matcher->start(i, status) != expectedStart) {
3658             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3659                 line, i, expectedStart, matcher->start(i, status));
3660             failed = TRUE;
3661             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3662         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3663             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3664                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3665             failed = TRUE;
3666             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3667         }
3668
3669         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3670         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3671         if (matcher->end(i, status) != expectedEnd) {
3672             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3673                 line, i, expectedEnd, matcher->end(i, status));
3674             failed = TRUE;
3675             // Error on end position;  keep going; real error is probably yet to come as group
3676             //   end positions work from end of the input data towards the front.
3677         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3678             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3679                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3680             failed = TRUE;
3681             // Error on end position;  keep going; real error is probably yet to come as group
3682             //   end positions work from end of the input data towards the front.
3683         }
3684     }
3685     if ( matcher->groupCount()+1 < groupStarts.size()) {
3686         errln("Error at line %d: Expected %d capture groups, found %d.",
3687             line, groupStarts.size()-1, matcher->groupCount());
3688         failed = TRUE;
3689         }
3690     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3691         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3692               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3693         failed = TRUE;
3694     }
3695
3696     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3697         matcher->requireEnd() == TRUE) {
3698         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3699         failed = TRUE;
3700     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3701         UTF8Matcher->requireEnd() == TRUE) {
3702         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3703         failed = TRUE;
3704     }
3705
3706     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3707         matcher->requireEnd() == FALSE) {
3708         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3709         failed = TRUE;
3710     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3711         UTF8Matcher->requireEnd() == FALSE) {
3712         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3713         failed = TRUE;
3714     }
3715
3716     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3717         matcher->hitEnd() == TRUE) {
3718         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3719         failed = TRUE;
3720     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3721                UTF8Matcher->hitEnd() == TRUE) {
3722         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3723         failed = TRUE;
3724     }
3725
3726     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3727         matcher->hitEnd() == FALSE) {
3728         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3729         failed = TRUE;
3730     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3731                UTF8Matcher->hitEnd() == FALSE) {
3732         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3733         failed = TRUE;
3734     }
3735
3736
3737 cleanupAndReturn:
3738     if (failed) {
3739         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3740             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3741         // callerPattern->dump();
3742     }
3743     delete parseMatcher;
3744     delete parsePat;
3745     delete UTF8Matcher;
3746     delete UTF8Pattern;
3747     delete matcher;
3748     delete callerPattern;
3749
3750     utext_close(&inputText);
3751     delete[] inputChars;
3752     utext_close(&patternText);
3753     delete[] patternChars;
3754     ucnv_close(UTF8Converter);
3755 }
3756
3757
3758
3759
3760 //---------------------------------------------------------------------------
3761 //
3762 //      Errors     Check for error handling in patterns.
3763 //
3764 //---------------------------------------------------------------------------
3765 void RegexTest::Errors() {
3766     // \escape sequences that aren't implemented yet.
3767     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3768
3769     // Missing close parentheses
3770     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3771     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3772     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3773
3774     // Extra close paren
3775     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3776     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3777     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3778
3779     // Look-ahead, Look-behind
3780     //  TODO:  add tests for unbounded length look-behinds.
3781     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3782
3783     // Attempt to use non-default flags
3784     {
3785         UParseError   pe;
3786         UErrorCode    status = U_ZERO_ERROR;
3787         int32_t       flags  = UREGEX_CANON_EQ |
3788                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3789                                UREGEX_MULTILINE;
3790         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3791         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3792         delete pat1;
3793     }
3794
3795
3796     // Quantifiers are allowed only after something that can be quantified.
3797     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3798     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3799     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3800
3801     // Mal-formed {min,max} quantifiers
3802     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3803     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3804     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3805     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3806     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3807     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3808     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3809     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3810     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3811
3812     // Ticket 5389
3813     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3814
3815     // Invalid Back Reference \0
3816     //    For ICU 3.8 and earlier
3817     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3818     //
3819     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3820
3821 }
3822
3823
3824 //-------------------------------------------------------------------------------
3825 //
3826 //  Read a text data file, convert it to UChars, and return the data
3827 //    in one big UChar * buffer, which the caller must delete.
3828 //
3829 //--------------------------------------------------------------------------------
3830 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3831                                      const char *defEncoding, UErrorCode &status) {
3832     UChar       *retPtr  = NULL;
3833     char        *fileBuf = NULL;
3834     UConverter* conv     = NULL;
3835     FILE        *f       = NULL;
3836
3837     ulen = 0;
3838     if (U_FAILURE(status)) {
3839         return retPtr;
3840     }
3841
3842     //
3843     //  Open the file.
3844     //
3845     f = fopen(fileName, "rb");
3846     if (f == 0) {
3847         dataerrln("Error opening test data file %s\n", fileName);
3848         status = U_FILE_ACCESS_ERROR;
3849         return NULL;
3850     }
3851     //
3852     //  Read it in
3853     //
3854     int32_t            fileSize;
3855     int32_t            amt_read;
3856
3857     fseek( f, 0, SEEK_END);
3858     fileSize = ftell(f);
3859     fileBuf = new char[fileSize];
3860     fseek(f, 0, SEEK_SET);
3861     amt_read = fread(fileBuf, 1, fileSize, f);
3862     if (amt_read != fileSize || fileSize <= 0) {
3863         errln("Error reading test data file.");
3864         goto cleanUpAndReturn;
3865     }
3866
3867     //
3868     // Look for a Unicode Signature (BOM) on the data just read
3869     //
3870     int32_t        signatureLength;
3871     const char *   fileBufC;
3872     const char*    encoding;
3873
3874     fileBufC = fileBuf;
3875     encoding = ucnv_detectUnicodeSignature(
3876         fileBuf, fileSize, &signatureLength, &status);
3877     if(encoding!=NULL ){
3878         fileBufC  += signatureLength;
3879         fileSize  -= signatureLength;
3880     } else {
3881         encoding = defEncoding;
3882         if (strcmp(encoding, "utf-8") == 0) {
3883             errln("file %s is missing its BOM", fileName);
3884         }
3885     }
3886
3887     //
3888     // Open a converter to take the rule file to UTF-16
3889     //
3890     conv = ucnv_open(encoding, &status);
3891     if (U_FAILURE(status)) {
3892         goto cleanUpAndReturn;
3893     }
3894
3895     //
3896     // Convert the rules to UChar.
3897     //  Preflight first to determine required buffer size.
3898     //
3899     ulen = ucnv_toUChars(conv,
3900         NULL,           //  dest,
3901         0,              //  destCapacity,
3902         fileBufC,
3903         fileSize,
3904         &status);
3905     if (status == U_BUFFER_OVERFLOW_ERROR) {
3906         // Buffer Overflow is expected from the preflight operation.
3907         status = U_ZERO_ERROR;
3908
3909         retPtr = new UChar[ulen+1];
3910         ucnv_toUChars(conv,
3911             retPtr,       //  dest,
3912             ulen+1,
3913             fileBufC,
3914             fileSize,
3915             &status);
3916     }
3917
3918 cleanUpAndReturn:
3919     fclose(f);
3920     delete[] fileBuf;
3921     ucnv_close(conv);
3922     if (U_FAILURE(status)) {
3923         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3924         delete []retPtr;
3925         retPtr = 0;
3926         ulen   = 0;
3927     };
3928     return retPtr;
3929 }
3930
3931
3932 //-------------------------------------------------------------------------------
3933 //
3934 //   PerlTests  - Run Perl's regular expression tests
3935 //                The input file for this test is re_tests, the standard regular
3936 //                expression test data distributed with the Perl source code.
3937 //
3938 //                Here is Perl's description of the test data file:
3939 //
3940 //        # The tests are in a separate file 't/op/re_tests'.
3941 //        # Each line in that file is a separate test.
3942 //        # There are five columns, separated by tabs.
3943 //        #
3944 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3945 //        # Modifiers can be put after the closing C<'>.
3946 //        #
3947 //        # Column 2 contains the string to be matched.
3948 //        #
3949 //        # Column 3 contains the expected result:
3950 //        #     y   expect a match
3951 //        #     n   expect no match
3952 //        #     c   expect an error
3953 //        # B   test exposes a known bug in Perl, should be skipped
3954 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3955 //        #
3956 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3957 //        #
3958 //        # Column 4 contains a string, usually C<$&>.
3959 //        #
3960 //        # Column 5 contains the expected result of double-quote
3961 //        # interpolating that string after the match, or start of error message.
3962 //        #
3963 //        # Column 6, if present, contains a reason why the test is skipped.
3964 //        # This is printed with "skipped", for harness to pick up.
3965 //        #
3966 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3967 //        #
3968 //        # If you want to add a regular expression test that can't be expressed
3969 //        # in this format, don't add it here: put it in op/pat.t instead.
3970 //
3971 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
3973 //        (The i is in addition to whatever was there before.)
3974 //
3975 //-------------------------------------------------------------------------------
3976 void RegexTest::PerlTests() {
3977     char tdd[2048];
3978     const char *srcPath;
3979     UErrorCode  status = U_ZERO_ERROR;
3980     UParseError pe;
3981
3982     //
3983     //  Open and read the test data file.
3984     //
3985     srcPath=getPath(tdd, "re_tests.txt");
3986     if(srcPath==NULL) {
3987         return; /* something went wrong, error already output */
3988     }
3989
3990     int32_t    len;
3991     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3992     if (U_FAILURE(status)) {
3993         return; /* something went wrong, error already output */
3994     }
3995
3996     //
3997     //  Put the test data into a UnicodeString
3998     //
3999     UnicodeString testDataString(FALSE, testData, len);
4000
4001     //
4002     //  Regex to break the input file into lines, and strip the new lines.
4003     //     One line per match, capture group one is the desired data.
4004     //
4005     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4006     if (U_FAILURE(status)) {
4007         dataerrln("RegexPattern::compile() error");
4008         return;
4009     }
4010     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4011
4012     //
4013     //  Regex to split a test file line into fields.
4014     //    There are six fields, separated by tabs.
4015     //
4016     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4017
4018     //
4019     //  Regex to identify test patterns with flag settings, and to separate them.
4020     //    Test patterns with flags look like 'pattern'i
4021     //    Test patterns without flags are not quoted:   pattern
4022     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4023     //
4024     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4025     RegexMatcher* flagMat = flagPat->matcher(status);
4026
4027     //
4028     // The Perl tests reference several perl-isms, which are evaluated/substituted
4029     //   in the test data.  Not being perl, this must be done explicitly.  Here
4030     //   are string constants and REs for these constructs.
4031     //
4032     UnicodeString nulnulSrc("${nulnul}");
4033     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4034     nulnul = nulnul.unescape();
4035
4036     UnicodeString ffffSrc("${ffff}");
4037     UnicodeString ffff("\\uffff", -1, US_INV);
4038     ffff = ffff.unescape();
4039
4040     //  regexp for $-[0], $+[2], etc.
4041     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4042     RegexMatcher *groupsMat = groupsPat->matcher(status);
4043
4044     //  regexp for $0, $1, $2, etc.
4045     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4046     RegexMatcher *cgMat = cgPat->matcher(status);
4047
4048
4049     //
4050     // Main Loop for the Perl Tests, runs once per line from the
4051     //   test data file.
4052     //
4053     int32_t  lineNum = 0;
4054     int32_t  skippedUnimplementedCount = 0;
4055     while (lineMat->find()) {
4056         lineNum++;
4057
4058         //
4059         //  Get a line, break it into its fields, do the Perl
4060         //    variable substitutions.
4061         //
4062         UnicodeString line = lineMat->group(1, status);
4063         UnicodeString fields[7];
4064         fieldPat->split(line, fields, 7, status);
4065
4066         flagMat->reset(fields[0]);
4067         flagMat->matches(status);
4068         UnicodeString pattern  = flagMat->group(2, status);
4069         pattern.findAndReplace("${bang}", "!");
4070         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4071         pattern.findAndReplace(ffffSrc, ffff);
4072
4073         //
4074         //  Identify patterns that include match flag settings,
4075         //    split off the flags, remove the extra quotes.
4076         //
4077         UnicodeString flagStr = flagMat->group(3, status);
4078         if (U_FAILURE(status)) {
4079             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4080             return;
4081         }
4082         int32_t flags = 0;
4083         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4084         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4085         const UChar UChar_m = 0x6d;
4086         const UChar UChar_x = 0x78;
4087         const UChar UChar_y = 0x79;
4088         if (flagStr.indexOf(UChar_i) != -1) {
4089             flags |= UREGEX_CASE_INSENSITIVE;
4090         }
4091         if (flagStr.indexOf(UChar_m) != -1) {
4092             flags |= UREGEX_MULTILINE;
4093         }
4094         if (flagStr.indexOf(UChar_x) != -1) {
4095             flags |= UREGEX_COMMENTS;
4096         }
4097
4098         //
4099         // Compile the test pattern.
4100         //
4101         status = U_ZERO_ERROR;
4102         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4103         if (status == U_REGEX_UNIMPLEMENTED) {
4104             //
4105             // Test of a feature that is planned for ICU, but not yet implemented.
4106             //   skip the test.
4107             skippedUnimplementedCount++;
4108             delete testPat;
4109             status = U_ZERO_ERROR;
4110             continue;
4111         }
4112
4113         if (U_FAILURE(status)) {
4114             // Some tests are supposed to generate errors.
4115             //   Only report an error for tests that are supposed to succeed.
4116             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4117                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4118             {
4119                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4120             }
4121             status = U_ZERO_ERROR;
4122             delete testPat;
4123             continue;
4124         }
4125
4126         if (fields[2].indexOf(UChar_i) >= 0) {
4127             // ICU should skip this test.
4128             delete testPat;
4129             continue;
4130         }
4131
4132         if (fields[2].indexOf(UChar_c) >= 0) {
4133             // This pattern should have caused a compilation error, but didn't/
4134             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4135             delete testPat;
4136             continue;
4137         }
4138
4139         //
4140         // replace the Perl variables that appear in some of the
4141         //   match data strings.
4142         //
4143         UnicodeString matchString = fields[1];
4144         matchString.findAndReplace(nulnulSrc, nulnul);
4145         matchString.findAndReplace(ffffSrc,   ffff);
4146
4147         // Replace any \n in the match string with an actual new-line char.
4148         //  Don't do full unescape, as this unescapes more than Perl does, which
4149         //  causes other spurious failures in the tests.
4150         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4151
4152
4153
4154         //
4155         // Run the test, check for expected match/don't match result.
4156         //
4157         RegexMatcher *testMat = testPat->matcher(matchString, status);
4158         UBool found = testMat->find();
4159         UBool expected = FALSE;
4160         if (fields[2].indexOf(UChar_y) >=0) {
4161             expected = TRUE;
4162         }
4163         if (expected != found) {
4164             errln("line %d: Expected %smatch, got %smatch",
4165                 lineNum, expected?"":"no ", found?"":"no " );
4166             continue;
4167         }
4168
4169         // Don't try to check expected results if there is no match.
4170         //   (Some have stuff in the expected fields)
4171         if (!found) {
4172             delete testMat;
4173             delete testPat;
4174             continue;
4175         }
4176
4177         //
4178         // Interpret the Perl expression from the fourth field of the data file,
4179         // building up an ICU string from the results of the ICU match.
4180         //   The Perl expression will contain references to the results of
4181         //     a regex match, including the matched string, capture group strings,
4182         //     group starting and ending indicies, etc.
4183         //
4184         UnicodeString resultString;
4185         UnicodeString perlExpr = fields[3];
4186 #if SUPPORT_MUTATING_INPUT_STRING
4187         groupsMat->reset(perlExpr);
4188         cgMat->reset(perlExpr);
4189 #endif
4190
4191         while (perlExpr.length() > 0) {
4192 #if !SUPPORT_MUTATING_INPUT_STRING
4193             //  Perferred usage.  Reset after any modification to input string.
4194             groupsMat->reset(perlExpr);
4195             cgMat->reset(perlExpr);
4196 #endif
4197
4198             if (perlExpr.startsWith("$&")) {
4199                 resultString.append(testMat->group(status));
4200                 perlExpr.remove(0, 2);
4201             }
4202
4203             else if (groupsMat->lookingAt(status)) {
4204                 // $-[0]   $+[2]  etc.
4205                 UnicodeString digitString = groupsMat->group(2, status);
4206                 int32_t t = 0;
4207                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4208                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4209                 int32_t matchPosition;
4210                 if (plusOrMinus.compare("+") == 0) {
4211                     matchPosition = testMat->end(groupNum, status);
4212                 } else {
4213                     matchPosition = testMat->start(groupNum, status);
4214                 }
4215                 if (matchPosition != -1) {
4216                     ICU_Utility::appendNumber(resultString, matchPosition);
4217                 }
4218                 perlExpr.remove(0, groupsMat->end(status));
4219             }
4220
4221             else if (cgMat->lookingAt(status)) {
4222                 // $1, $2, $3, etc.
4223                 UnicodeString digitString = cgMat->group(1, status);
4224                 int32_t t = 0;
4225                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4226                 if (U_SUCCESS(status)) {
4227                     resultString.append(testMat->group(groupNum, status));
4228                     status = U_ZERO_ERROR;
4229                 }
4230                 perlExpr.remove(0, cgMat->end(status));
4231             }
4232
4233             else if (perlExpr.startsWith("@-")) {
4234                 int32_t i;
4235                 for (i=0; i<=testMat->groupCount(); i++) {
4236                     if (i>0) {
4237                         resultString.append(" ");
4238                     }
4239                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4240                 }
4241                 perlExpr.remove(0, 2);
4242             }
4243
4244             else if (perlExpr.startsWith("@+")) {
4245                 int32_t i;
4246                 for (i=0; i<=testMat->groupCount(); i++) {
4247                     if (i>0) {
4248                         resultString.append(" ");
4249                     }
4250                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4251                 }
4252                 perlExpr.remove(0, 2);
4253             }
4254
4255             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4256                                                      //           or as an escaped sequence (e.g. \n)
4257                 if (perlExpr.length() > 1) {
4258                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4259                 }
4260                 UChar c = perlExpr.charAt(0);
4261                 switch (c) {
4262                 case 'n':   c = '\n'; break;
4263                 // add any other escape sequences that show up in the test expected results.
4264                 }
4265                 resultString.append(c);
4266                 perlExpr.remove(0, 1);
4267             }
4268
4269             else  {
4270                 // Any characters from the perl expression that we don't explicitly
4271                 //  recognize before here are assumed to be literals and copied
4272                 //  as-is to the expected results.
4273                 resultString.append(perlExpr.charAt(0));
4274                 perlExpr.remove(0, 1);
4275             }
4276
4277             if (U_FAILURE(status)) {
4278                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4279                 break;
4280             }
4281         }
4282
4283         //
4284         // Expected Results Compare
4285         //
4286         UnicodeString expectedS(fields[4]);
4287         expectedS.findAndReplace(nulnulSrc, nulnul);
4288         expectedS.findAndReplace(ffffSrc,   ffff);
4289         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4290
4291
4292         if (expectedS.compare(resultString) != 0) {
4293             err("Line %d: Incorrect perl expression results.", lineNum);
4294             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4295         }
4296
4297         delete testMat;
4298         delete testPat;
4299     }
4300
4301     //
4302     // All done.  Clean up allocated stuff.
4303     //
4304     delete cgMat;
4305     delete cgPat;
4306
4307     delete groupsMat;
4308     delete groupsPat;
4309
4310     delete flagMat;
4311     delete flagPat;
4312
4313     delete lineMat;
4314     delete linePat;
4315
4316     delete fieldPat;
4317     delete [] testData;
4318
4319
4320     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4321
4322 }
4323
4324
4325 //-------------------------------------------------------------------------------
4326 //
4327 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4328 //                  (instead of using UnicodeStrings) to test the alternate engine.
4329 //                  The input file for this test is re_tests, the standard regular
4330 //                  expression test data distributed with the Perl source code.
4331 //                  See PerlTests() for more information.
4332 //
4333 //-------------------------------------------------------------------------------
4334 void RegexTest::PerlTestsUTF8() {
4335     char tdd[2048];
4336     const char *srcPath;
4337     UErrorCode  status = U_ZERO_ERROR;
4338     UParseError pe;
4339     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4340     UText       patternText = UTEXT_INITIALIZER;
4341     char       *patternChars = NULL;
4342     int32_t     patternLength;
4343     int32_t     patternCapacity = 0;
4344     UText       inputText = UTEXT_INITIALIZER;
4345     char       *inputChars = NULL;
4346     int32_t     inputLength;
4347     int32_t     inputCapacity = 0;
4348
4349     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4350
4351     //
4352     //  Open and read the test data file.
4353     //
4354     srcPath=getPath(tdd, "re_tests.txt");
4355     if(srcPath==NULL) {
4356         return; /* something went wrong, error already output */
4357     }
4358
4359     int32_t    len;
4360     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4361     if (U_FAILURE(status)) {
4362         return; /* something went wrong, error already output */
4363     }
4364
4365     //
4366     //  Put the test data into a UnicodeString
4367     //
4368     UnicodeString testDataString(FALSE, testData, len);
4369
4370     //
4371     //  Regex to break the input file into lines, and strip the new lines.
4372     //     One line per match, capture group one is the desired data.
4373     //
4374     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4375     if (U_FAILURE(status)) {
4376         dataerrln("RegexPattern::compile() error");
4377         return;
4378     }
4379     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4380
4381     //
4382     //  Regex to split a test file line into fields.
4383     //    There are six fields, separated by tabs.
4384     //
4385     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4386
4387     //
4388     //  Regex to identify test patterns with flag settings, and to separate them.
4389     //    Test patterns with flags look like 'pattern'i
4390     //    Test patterns without flags are not quoted:   pattern
4391     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4392     //
4393     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4394     RegexMatcher* flagMat = flagPat->matcher(status);
4395
4396     //
4397     // The Perl tests reference several perl-isms, which are evaluated/substituted
4398     //   in the test data.  Not being perl, this must be done explicitly.  Here
4399     //   are string constants and REs for these constructs.
4400     //
4401     UnicodeString nulnulSrc("${nulnul}");
4402     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4403     nulnul = nulnul.unescape();
4404
4405     UnicodeString ffffSrc("${ffff}");
4406     UnicodeString ffff("\\uffff", -1, US_INV);
4407     ffff = ffff.unescape();
4408
4409     //  regexp for $-[0], $+[2], etc.
4410     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4411     RegexMatcher *groupsMat = groupsPat->matcher(status);
4412
4413     //  regexp for $0, $1, $2, etc.
4414     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4415     RegexMatcher *cgMat = cgPat->matcher(status);
4416
4417
4418     //
4419     // Main Loop for the Perl Tests, runs once per line from the
4420     //   test data file.
4421     //
4422     int32_t  lineNum = 0;
4423     int32_t  skippedUnimplementedCount = 0;
4424     while (lineMat->find()) {
4425         lineNum++;
4426
4427         //
4428         //  Get a line, break it into its fields, do the Perl
4429         //    variable substitutions.
4430         //
4431         UnicodeString line = lineMat->group(1, status);
4432         UnicodeString fields[7];
4433         fieldPat->split(line, fields, 7, status);
4434
4435         flagMat->reset(fields[0]);
4436         flagMat->matches(status);
4437         UnicodeString pattern  = flagMat->group(2, status);
4438         pattern.findAndReplace("${bang}", "!");
4439         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4440         pattern.findAndReplace(ffffSrc, ffff);
4441
4442         //
4443         //  Identify patterns that include match flag settings,
4444         //    split off the flags, remove the extra quotes.
4445         //
4446         UnicodeString flagStr = flagMat->group(3, status);
4447         if (U_FAILURE(status)) {
4448             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4449             return;
4450         }
4451         int32_t flags = 0;
4452         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4453         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4454         const UChar UChar_m = 0x6d;
4455         const UChar UChar_x = 0x78;
4456         const UChar UChar_y = 0x79;
4457         if (flagStr.indexOf(UChar_i) != -1) {
4458             flags |= UREGEX_CASE_INSENSITIVE;
4459         }
4460         if (flagStr.indexOf(UChar_m) != -1) {
4461             flags |= UREGEX_MULTILINE;
4462         }
4463         if (flagStr.indexOf(UChar_x) != -1) {
4464             flags |= UREGEX_COMMENTS;
4465         }
4466
4467         //
4468         // Put the pattern in a UTF-8 UText
4469         //
4470         status = U_ZERO_ERROR;
4471         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4472         if (status == U_BUFFER_OVERFLOW_ERROR) {
4473             status = U_ZERO_ERROR;
4474             delete[] patternChars;
4475             patternCapacity = patternLength + 1;
4476             patternChars = new char[patternCapacity];
4477             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4478         }
4479         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4480
4481         //
4482         // Compile the test pattern.
4483         //
4484         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4485         if (status == U_REGEX_UNIMPLEMENTED) {
4486             //
4487             // Test of a feature that is planned for ICU, but not yet implemented.
4488             //   skip the test.
4489             skippedUnimplementedCount++;
4490             delete testPat;
4491             status = U_ZERO_ERROR;
4492             continue;
4493         }
4494
4495         if (U_FAILURE(status)) {
4496             // Some tests are supposed to generate errors.
4497             //   Only report an error for tests that are supposed to succeed.
4498             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4499                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4500             {
4501                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4502             }
4503             status = U_ZERO_ERROR;
4504             delete testPat;
4505             continue;
4506         }
4507
4508         if (fields[2].indexOf(UChar_i) >= 0) {
4509             // ICU should skip this test.
4510             delete testPat;
4511             continue;
4512         }
4513
4514         if (fields[2].indexOf(UChar_c) >= 0) {
4515             // This pattern should have caused a compilation error, but didn't/
4516             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4517             delete testPat;
4518             continue;
4519         }
4520
4521
4522         //
4523         // replace the Perl variables that appear in some of the
4524         //   match data strings.
4525         //
4526         UnicodeString matchString = fields[1];
4527         matchString.findAndReplace(nulnulSrc, nulnul);
4528         matchString.findAndReplace(ffffSrc,   ffff);
4529
4530         // Replace any \n in the match string with an actual new-line char.
4531         //  Don't do full unescape, as this unescapes more than Perl does, which
4532         //  causes other spurious failures in the tests.
4533         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4534
4535         //
4536         // Put the input in a UTF-8 UText
4537         //
4538         status = U_ZERO_ERROR;
4539         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4540         if (status == U_BUFFER_OVERFLOW_ERROR) {
4541             status = U_ZERO_ERROR;
4542             delete[] inputChars;
4543             inputCapacity = inputLength + 1;
4544             inputChars = new char[inputCapacity];
4545             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4546         }
4547         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4548
4549         //
4550         // Run the test, check for expected match/don't match result.
4551         //
4552         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4553         UBool found = testMat->find();
4554         UBool expected = FALSE;
4555         if (fields[2].indexOf(UChar_y) >=0) {
4556             expected = TRUE;
4557         }
4558         if (expected != found) {
4559             errln("line %d: Expected %smatch, got %smatch",
4560                 lineNum, expected?"":"no ", found?"":"no " );
4561             continue;
4562         }
4563
4564         // Don't try to check expected results if there is no match.
4565         //   (Some have stuff in the expected fields)
4566         if (!found) {
4567             delete testMat;
4568             delete testPat;
4569             continue;
4570         }
4571
4572         //
4573         // Interpret the Perl expression from the fourth field of the data file,
4574         // building up an ICU string from the results of the ICU match.
4575         //   The Perl expression will contain references to the results of
4576         //     a regex match, including the matched string, capture group strings,
4577         //     group starting and ending indicies, etc.
4578         //
4579         UnicodeString resultString;
4580         UnicodeString perlExpr = fields[3];
4581
4582         while (perlExpr.length() > 0) {
4583             groupsMat->reset(perlExpr);
4584             cgMat->reset(perlExpr);
4585
4586             if (perlExpr.startsWith("$&")) {
4587                 resultString.append(testMat->group(status));
4588                 perlExpr.remove(0, 2);
4589             }
4590
4591             else if (groupsMat->lookingAt(status)) {
4592                 // $-[0]   $+[2]  etc.
4593                 UnicodeString digitString = groupsMat->group(2, status);
4594                 int32_t t = 0;
4595                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4596                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4597                 int32_t matchPosition;
4598                 if (plusOrMinus.compare("+") == 0) {
4599                     matchPosition = testMat->end(groupNum, status);
4600                 } else {
4601                     matchPosition = testMat->start(groupNum, status);
4602                 }
4603                 if (matchPosition != -1) {
4604                     ICU_Utility::appendNumber(resultString, matchPosition);
4605                 }
4606                 perlExpr.remove(0, groupsMat->end(status));
4607             }
4608
4609             else if (cgMat->lookingAt(status)) {
4610                 // $1, $2, $3, etc.
4611                 UnicodeString digitString = cgMat->group(1, status);
4612                 int32_t t = 0;
4613                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4614                 if (U_SUCCESS(status)) {
4615                     resultString.append(testMat->group(groupNum, status));
4616                     status = U_ZERO_ERROR;
4617                 }
4618                 perlExpr.remove(0, cgMat->end(status));
4619             }
4620
4621             else if (perlExpr.startsWith("@-")) {
4622                 int32_t i;
4623                 for (i=0; i<=testMat->groupCount(); i++) {
4624                     if (i>0) {
4625                         resultString.append(" ");
4626                     }
4627                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4628                 }
4629                 perlExpr.remove(0, 2);
4630             }
4631
4632             else if (perlExpr.startsWith("@+")) {
4633                 int32_t i;
4634                 for (i=0; i<=testMat->groupCount(); i++) {
4635                     if (i>0) {
4636                         resultString.append(" ");
4637                     }
4638                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4639                 }
4640                 perlExpr.remove(0, 2);
4641             }
4642
4643             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4644                                                      //           or as an escaped sequence (e.g. \n)
4645                 if (perlExpr.length() > 1) {
4646                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4647                 }
4648                 UChar c = perlExpr.charAt(0);
4649                 switch (c) {
4650                 case 'n':   c = '\n'; break;
4651                 // add any other escape sequences that show up in the test expected results.
4652                 }
4653                 resultString.append(c);
4654                 perlExpr.remove(0, 1);
4655             }
4656
4657             else  {
4658                 // Any characters from the perl expression that we don't explicitly
4659                 //  recognize before here are assumed to be literals and copied
4660                 //  as-is to the expected results.
4661                 resultString.append(perlExpr.charAt(0));
4662                 perlExpr.remove(0, 1);
4663             }
4664
4665             if (U_FAILURE(status)) {
4666                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4667                 break;
4668             }
4669         }
4670
4671         //
4672         // Expected Results Compare
4673         //
4674         UnicodeString expectedS(fields[4]);
4675         expectedS.findAndReplace(nulnulSrc, nulnul);
4676         expectedS.findAndReplace(ffffSrc,   ffff);
4677         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4678
4679
4680         if (expectedS.compare(resultString) != 0) {
4681             err("Line %d: Incorrect perl expression results.", lineNum);
4682             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4683         }
4684
4685         delete testMat;
4686         delete testPat;
4687     }
4688
4689     //
4690     // All done.  Clean up allocated stuff.
4691     //
4692     delete cgMat;
4693     delete cgPat;
4694
4695     delete groupsMat;
4696     delete groupsPat;
4697
4698     delete flagMat;
4699     delete flagPat;
4700
4701     delete lineMat;
4702     delete linePat;
4703
4704     delete fieldPat;
4705     delete [] testData;
4706
4707     utext_close(&patternText);
4708     utext_close(&inputText);
4709
4710     delete [] patternChars;
4711     delete [] inputChars;
4712
4713
4714     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4715
4716 }
4717
4718
4719 //--------------------------------------------------------------
4720 //
4721 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4722 //             Use this pattern,
4723 //                 "(a?){1,8000000}"
4724 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4725 //                   This test is likely to be fragile, as further optimizations stop
4726 //                   more cases of pointless looping in the match engine.
4727 //
4728 //---------------------------------------------------------------
4729 void RegexTest::Bug6149() {
         // Match "xyz" against a pattern with a very large bounded repeat and
         //   check that the operation ends with U_REGEX_STACK_OVERFLOW and
         //   does not report a match.
4730     UnicodeString pattern("(a?){1,8000000}");
4731     UnicodeString s("xyz");
4732     uint32_t flags = 0;
4733     UErrorCode status = U_ZERO_ERROR;
4734
4735     RegexMatcher  matcher(pattern, s, flags, status);
4736     UBool result = false;
         // REGEX_ASSERT_FAIL is defined elsewhere in this file; presumably it reports
         //   a test failure unless the expression leaves 'status' at the given
         //   error code -- TODO confirm against the macro definition.
4737     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
         // The aborted match must not be reported as successful.
4738     REGEX_ASSERT(result == FALSE);
4739  }
4740
4741
4742 //
4743 //   Callbacks()    Test the callback function.
4744 //                  When set, callbacks occur periodically during matching operations,
4745 //                  giving the application code the ability to abort the operation
4746 //                  before its normal completion.
4747 //
4748
4749 struct callBackContext {
     // State handed to testCallBackFn() through the callback's context pointer;
     //   RegexTest::Callbacks() passes its address to setMatchCallback().
4750     RegexTest        *test;        // test object used by the callback to report errors (errln)
4751     int32_t          maxCalls;     // the callback returns FALSE once numCalls reaches this value
4752     int32_t          numCalls;     // callback invocations counted since the last reset()
4753     int32_t          lastSteps;    // 'steps' argument received by the most recent callback
         // Install a new call limit and zero the counters.
4754     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4755 };
4756
4757 U_CDECL_BEGIN
     // Match callback installed by RegexTest::Callbacks().
     //   context - pointer to a callBackContext (cast back from const void *).
     //   steps   - value supplied by the caller of the callback; this function
     //             reports an error unless it is exactly one more than the value
     //             seen on the previous call (lastSteps starts at 0 after reset()).
     //   Returns TRUE while fewer than maxCalls calls have been counted, FALSE
     //   from the maxCalls-th call on.
4758 static UBool U_CALLCONV
4759 testCallBackFn(const void *context, int32_t steps) {
4760     callBackContext  *info = (callBackContext *)context;
4761     if (info->lastSteps+1 != steps) {
4762         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4763     }
4764     info->lastSteps = steps;
4765     info->numCalls++;
         // The count is incremented before the comparison, so with maxCalls == N
         //   the N-th call is the first one to return FALSE.
4766     return (info->numCalls < info->maxCalls);
4767 }
4768 U_CDECL_END
4769
4770 void RegexTest::Callbacks() {
         // Exercises RegexMatcher::getMatchCallback()/setMatchCallback() with
         //   testCallBackFn: first the getter on a matcher with no callback, then
         //   set/get round-tripping and matches()/find() calls that the callback
         //   either lets finish or aborts.
4771    {
4772         // Getter returns NULLs if no callback has been set
4773
4774         //   The variables that the getter will fill in.
4775         //   Init to non-null values so that the action of the getter can be seen.
4776         const void          *returnedContext = &returnedContext;
4777         URegexMatchCallback *returnedFn = &testCallBackFn;
4778
4779         UErrorCode status = U_ZERO_ERROR;
4780         RegexMatcher matcher("x", 0, status);
4781         REGEX_CHECK_STATUS;
4782         matcher.getMatchCallback(returnedFn, returnedContext, status);
4783         REGEX_CHECK_STATUS;
4784         REGEX_ASSERT(returnedFn == NULL);
4785         REGEX_ASSERT(returnedContext == NULL);
4786     }
4787
4788    {
4789         // Set and Get work
4790         callBackContext cbInfo = {this, 0, 0, 0};
4791         const void          *returnedContext;
4792         URegexMatchCallback *returnedFn;
4793         UErrorCode status = U_ZERO_ERROR;
4794         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4795         REGEX_CHECK_STATUS;
4796         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4797         REGEX_CHECK_STATUS;
4798         matcher.getMatchCallback(returnedFn, returnedContext, status);
4799         REGEX_CHECK_STATUS;
4800         REGEX_ASSERT(returnedFn == testCallBackFn);
4801         REGEX_ASSERT(returnedContext == &cbInfo);
4802
4803         // A short-running match shouldn't invoke the callback
             //   (with a limit of 1, testCallBackFn would return FALSE on its very
             //    first call, so the test also asserts that no call was counted).
4804         status = U_ZERO_ERROR;
4805         cbInfo.reset(1);
4806         UnicodeString s = "xxx";
4807         matcher.reset(s);
4808         REGEX_ASSERT(matcher.matches(status));
4809         REGEX_CHECK_STATUS;
4810         REGEX_ASSERT(cbInfo.numCalls == 0);
4811
4812         // A medium-length match that runs long enough to invoke the
4813         //   callback, but not so long that the callback aborts it.
             //   (limit 4: the first three callbacks return TRUE, the fourth FALSE.)
4814         status = U_ZERO_ERROR;
4815         cbInfo.reset(4);
4816         s = "aaaaaaaaaaaaaaaaaaab";
4817         matcher.reset(s);
4818         REGEX_ASSERT(matcher.matches(status)==FALSE);
4819         REGEX_CHECK_STATUS;
4820         REGEX_ASSERT(cbInfo.numCalls > 0);
4821
4822         // A longer running match that the callback function will abort.
             //   Expected outcome: status U_REGEX_STOPPED_BY_CALLER after exactly
             //   four callback invocations.
4823         status = U_ZERO_ERROR;
4824         cbInfo.reset(4);
4825         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4826         matcher.reset(s);
4827         REGEX_ASSERT(matcher.matches(status)==FALSE);
4828         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4829         REGEX_ASSERT(cbInfo.numCalls == 4);
4830
4831         // A longer running find that the callback function will abort.
             //   Same input and expectations as above, but through find().
4832         status = U_ZERO_ERROR;
4833         cbInfo.reset(4);
4834         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4835         matcher.reset(s);
4836         REGEX_ASSERT(matcher.find(status)==FALSE);
4837         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4838         REGEX_ASSERT(cbInfo.numCalls == 4);
4839     }
4840
4841
4842 }
4843
4844
4845 //
4846 //   FindProgressCallbacks()    Test the find "progress" callback function.
4847 //                  When set, the find progress callback will be invoked during a find operation
4848 //                  after each return from a match attempt, giving the application the opportunity
4849 //                  to terminate a long-running find operation before its normal completion.
4850 //
4851
4852 struct progressCallBackContext {
     // State handed to testProgressCallBackFn() through the callback's context
     //   pointer; RegexTest::FindProgressCallbacks() passes its address to
     //   setFindProgressCallback().
4853     RegexTest        *test;        // set to the running test; the callback only mentions it in a commented-out trace line
4854     int64_t          lastIndex;    // matchIndex argument received by the most recent callback
4855     int32_t          maxCalls;     // the callback returns FALSE once numCalls reaches this value
4856     int32_t          numCalls;     // callback invocations counted since the last reset()
         // Install a new call limit and zero the counter and the recorded index.
4857     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4858 };
4859
4860 // call-back function for find().
4861 // Return TRUE to continue the find().
4862 // Return FALSE to stop the find().
     //   context    - pointer to a progressCallBackContext (cast back from const void *).
     //   matchIndex - index supplied by the caller of the callback; it is stored in
     //                lastIndex, which FindProgressCallbacks() later passes to find()
     //                to resume an interrupted search.
     //   Returns TRUE while fewer than maxCalls calls have been counted.
4863 U_CDECL_BEGIN
4864 static UBool U_CALLCONV
4865 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4866     progressCallBackContext  *info = (progressCallBackContext *)context;
4867     info->numCalls++;
4868     info->lastIndex = matchIndex;
4869 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
         // The count is incremented before the comparison, so with maxCalls == N
         //   the N-th call is the first one to return FALSE.
4870     return (info->numCalls < info->maxCalls);
4871 }
4872 U_CDECL_END
4873
4874 void RegexTest::FindProgressCallbacks() {
         // Exercises RegexMatcher::getFindProgressCallback()/setFindProgressCallback()
         //   with testProgressCallBackFn: first the getter on a matcher with no
         //   callback, then set/get round-tripping and find() calls that the callback
         //   either lets finish or interrupts, including resuming after an interruption.
4875    {
4876         // Getter returns NULLs if no callback has been set
4877
4878         //   The variables that the getter will fill in.
4879         //   Init to non-null values so that the action of the getter can be seen.
4880         const void                  *returnedContext = &returnedContext;
4881         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4882
4883         UErrorCode status = U_ZERO_ERROR;
4884         RegexMatcher matcher("x", 0, status);
4885         REGEX_CHECK_STATUS;
4886         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4887         REGEX_CHECK_STATUS;
4888         REGEX_ASSERT(returnedFn == NULL);
4889         REGEX_ASSERT(returnedContext == NULL);
4890     }
4891
4892    {
4893         // Set and Get work
4894         progressCallBackContext cbInfo = {this, 0, 0, 0};
4895         const void                  *returnedContext;
4896         URegexFindProgressCallback  *returnedFn;
4897         UErrorCode status = U_ZERO_ERROR;
4898         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4899         REGEX_CHECK_STATUS;
4900         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4901         REGEX_CHECK_STATUS;
4902         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4903         REGEX_CHECK_STATUS;
4904         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4905         REGEX_ASSERT(returnedContext == &cbInfo);
4906
4907         // A find that matches on the initial position does NOT invoke the callback.
4908         status = U_ZERO_ERROR;
4909         cbInfo.reset(100);
4910         UnicodeString s = "aaxxx";
4911         matcher.reset(s);
4912 #if 0
4913         matcher.setTrace(TRUE);
4914 #endif
4915         REGEX_ASSERT(matcher.find(0, status));
4916         REGEX_CHECK_STATUS;
4917         REGEX_ASSERT(cbInfo.numCalls == 0);
4918
4919         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4920         //   but not so many times that we interrupt the operation.
             // NOTE(review): the limit passed below is exactly s.length(), not a value
             //   greater than it as the trailing comment says; the test relies on the
             //   callback being invoked fewer than s.length() times -- confirm intent.
4921         status = U_ZERO_ERROR;
4922         s = "aaaaaaaaaaaaaaaaaaab";
4923         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4924         matcher.reset(s);
4925         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4926         REGEX_CHECK_STATUS;
4927         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4928
4929         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
             //   Expected outcome: U_REGEX_STOPPED_BY_CALLER after exactly
             //   s1.length() - 5 callback invocations.
4930         status = U_ZERO_ERROR;
4931         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4932         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4933         matcher.reset(s1);
4934         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4935         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4936         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4937
4938         // Now a match that will succeed, but after an interruption
4939         status = U_ZERO_ERROR;
4940         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4941         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4942         matcher.reset(s2);
4943         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4944         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4945         // Now retry the match from where left off
             //   numCalls is deliberately not reset here; only the limit is raised so
             //   the callback keeps returning TRUE, and the search restarts at the
             //   index recorded by the last callback.
4946         cbInfo.maxCalls = 100; //  No callback limit
4947         status = U_ZERO_ERROR;
4948         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4949         REGEX_CHECK_STATUS;
4950     }
4951
4952
4953 }
4954
4955
4956 //---------------------------------------------------------------------------
4957 //
4958 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4959 //                             UTexts. The pure-C implementation of UText
4960 //                             has no mutable backing stores, but we can
4961 //                             use UnicodeString here to test the functionality.
4962 //
4963 //---------------------------------------------------------------------------
4964 void RegexTest::PreAllocatedUTextCAPI () {
         // Exercises the uregex_*UText C functions when the caller supplies the
         //   destination UText (bufferText, opened over the UnicodeString 'buffer'):
         //   getUText, groupUText, replaceFirstUText and replaceAllUText.
         //   Each call is expected to return the caller-supplied &bufferText.
4965     UErrorCode           status = U_ZERO_ERROR;
4966     URegularExpression  *re;
4967     UText                patternText = UTEXT_INITIALIZER;
4968     UnicodeString        buffer;
4969     UText                bufferText = UTEXT_INITIALIZER;
4970
4971     utext_openUnicodeString(&bufferText, &buffer, &status);
4972
4973     /*
4974      *  getText() and getUText()
4975      */
4976     {
4977         UText  text1 = UTEXT_INITIALIZER;
4978         UText  text2 = UTEXT_INITIALIZER;
4979         UChar  text2Chars[20];
4980         UText  *resultText;
4981
4982         status = U_ZERO_ERROR;
4983         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4984         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
             // Bound the copy by the destination array text2Chars; the previous bound
             //   was derived from the size of the UText struct 'text2', which is
             //   unrelated to the buffer being written.
4985         u_uastrncpy(text2Chars, "abcccxd", UPRV_LENGTHOF(text2Chars));
4986         utext_openUChars(&text2, text2Chars, -1, &status);
4987
4988         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4989         re = uregex_openUText(&patternText, 0, NULL, &status);
4990
4991         /* First set a UText */
4992         uregex_setUText(re, &text1, &status);
4993         resultText = uregex_getUText(re, &bufferText, &status);
4994         REGEX_CHECK_STATUS;
4995         REGEX_ASSERT(resultText == &bufferText);
4996         utext_setNativeIndex(resultText, 0);
4997         utext_setNativeIndex(&text1, 0);
4998         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4999
             // Ask again with the same destination; the result must be unchanged.
5000         resultText = uregex_getUText(re, &bufferText, &status);
5001         REGEX_CHECK_STATUS;
5002         REGEX_ASSERT(resultText == &bufferText);
5003         utext_setNativeIndex(resultText, 0);
5004         utext_setNativeIndex(&text1, 0);
5005         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5006
5007         /* Then set a UChar * */
5008         uregex_setText(re, text2Chars, 7, &status);
5009         resultText = uregex_getUText(re, &bufferText, &status);
5010         REGEX_CHECK_STATUS;
5011         REGEX_ASSERT(resultText == &bufferText);
5012         utext_setNativeIndex(resultText, 0);
5013         utext_setNativeIndex(&text2, 0);
5014         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5015
5016         uregex_close(re);
5017         utext_close(&text1);
5018         utext_close(&text2);
5019     }
5020
5021     /*
5022      *  group()
5023      */
5024     {
5025         UChar    text1[80];
5026         UText   *actual;
5027         UBool    result;
5028         int64_t  length = 0;
5029
5030         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5031         //                  012345678901234567890123456789012345678901234567
5032         //                  0         1         2         3         4
5033
5034         status = U_ZERO_ERROR;
5035         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5036         REGEX_CHECK_STATUS;
5037
5038         uregex_setText(re, text1, -1, &status);
5039         result = uregex_find(re, 0, &status);
5040         REGEX_ASSERT(result==TRUE);
5041
5042         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
             // The assertions below expect the returned UText to span the whole
             //   input (native length 47), positioned at the group start, with the
             //   group's length reported through 'length'.
5043         status = U_ZERO_ERROR;
5044         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5045         REGEX_CHECK_STATUS;
5046         REGEX_ASSERT(actual == &bufferText);
5047         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5048         REGEX_ASSERT(length == 16);
5049         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5050
5051         /*  Capture group #1.  Should succeed, matching " interior ". */
5052         status = U_ZERO_ERROR;
5053         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5054         REGEX_CHECK_STATUS;
5055         REGEX_ASSERT(actual == &bufferText);
5056         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5057         REGEX_ASSERT(length == 10);
5058         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5059
5060         /*  Capture group out of range.  Error. */
5061         status = U_ZERO_ERROR;
5062         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5063         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5064         REGEX_ASSERT(actual == &bufferText);
5065         uregex_close(re);
5066
5067     }
5068
5069     /*
5070      *  replaceFirst()
5071      */
5072     {
5073         UChar    text1[80];
5074         UChar    text2[80];
5075         UText    replText = UTEXT_INITIALIZER;
5076         UText   *result;
5077         status = U_ZERO_ERROR;
5078         utext_openUnicodeString(&bufferText, &buffer, &status);
5079
5080         status = U_ZERO_ERROR;
5081         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
             // UPRV_LENGTHOF already yields the element count; no further division.
5082         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2));
5083         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5084
5085         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5086         REGEX_CHECK_STATUS;
5087
5088         /*  Normal case, with match */
5089         uregex_setText(re, text1, -1, &status);
5090         REGEX_CHECK_STATUS;
             // Empty the destination: replace its whole range [0, length) with nothing.
5091         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5092         REGEX_CHECK_STATUS;
5093         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5094         REGEX_CHECK_STATUS;
5095         REGEX_ASSERT(result == &bufferText);
5096         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5097
5098         /* No match.  Text should copy to output with no changes.  */
5099         uregex_setText(re, text2, -1, &status);
5100         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5101         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5102         REGEX_CHECK_STATUS;
5103         REGEX_ASSERT(result == &bufferText);
5104         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5105
5106         /* Unicode escapes */
5107         uregex_setText(re, text1, -1, &status);
5108         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5109         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5110         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5111         REGEX_CHECK_STATUS;
5112         REGEX_ASSERT(result == &bufferText);
5113         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5114
5115         uregex_close(re);
5116         utext_close(&replText);
5117     }
5118
5119
5120     /*
5121      *  replaceAll()
5122      */
5123     {
5124         UChar    text1[80];
5125         UChar    text2[80];
5126         UText    replText = UTEXT_INITIALIZER;
5127         UText   *result;
5128
5129         status = U_ZERO_ERROR;
             // Capacity is given as an element count via UPRV_LENGTHOF, as in the
             //   group() and replaceFirst() sections, instead of sizeof(...)/2.
5130         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5131         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2));
5132         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5133
5134         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5135         REGEX_CHECK_STATUS;
5136
5137         /*  Normal case, with match */
5138         uregex_setText(re, text1, -1, &status);
5139         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5140         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5141         REGEX_CHECK_STATUS;
5142         REGEX_ASSERT(result == &bufferText);
5143         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5144
5145         /* No match.  Text should copy to output with no changes.  */
5146         uregex_setText(re, text2, -1, &status);
5147         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5148         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5149         REGEX_CHECK_STATUS;
5150         REGEX_ASSERT(result == &bufferText);
5151         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5152
5153         uregex_close(re);
5154         utext_close(&replText);
5155     }
5156
5157
5158     /*
5159      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5160      *   so we don't need to test it here.
5161      */
5162
5163     utext_close(&bufferText);
5164     utext_close(&patternText);
5165 }
5166
5167
5168 //--------------------------------------------------------------
5169 //
5170 //  NamedCapture   Check basic named capture group functionality
5171 //
5172 //--------------------------------------------------------------
5173 void RegexTest::NamedCapture() {
5174     UErrorCode status = U_ZERO_ERROR;
5175     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5176             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5177     REGEX_CHECK_STATUS;
5178     int32_t group = pat->groupNumberFromName("five", -1, status);
5179     REGEX_CHECK_STATUS;
5180     REGEX_ASSERT(5 == group);
5181     group = pat->groupNumberFromName("three", -1, status);
5182     REGEX_CHECK_STATUS;
5183     REGEX_ASSERT(3 == group);
5184
5185     status = U_ZERO_ERROR;
5186     group = pat->groupNumberFromName(UnicodeString("six"), status);
5187     REGEX_CHECK_STATUS;
5188     REGEX_ASSERT(6 == group);
5189
5190     status = U_ZERO_ERROR;
5191     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5192     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5193
5194     status = U_ZERO_ERROR;
5195
5196     // After copying a pattern, named capture should still work in the copy.
5197     RegexPattern *copiedPat = new RegexPattern(*pat);
5198     REGEX_ASSERT(*copiedPat == *pat);
5199     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5200
5201     group = copiedPat->groupNumberFromName("five", -1, status);
5202     REGEX_CHECK_STATUS;
5203     REGEX_ASSERT(5 == group);
5204     group = copiedPat->groupNumberFromName("three", -1, status);
5205     REGEX_CHECK_STATUS;
5206     REGEX_ASSERT(3 == group);
5207     delete copiedPat;
5208
5209     // ReplaceAll with named capture group.
5210     status = U_ZERO_ERROR;
5211     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5212     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5213     REGEX_CHECK_STATUS;
5214     // m.pattern().dumpPattern();
5215     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5216     REGEX_CHECK_STATUS;
5217     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5218     delete m;
5219
5220     // ReplaceAll, allowed capture group numbers.
5221     text = UnicodeString("abcmxyz");
5222     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5223     REGEX_CHECK_STATUS;
5224
5225     status = U_ZERO_ERROR;
5226     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5227     REGEX_CHECK_STATUS;
5228     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5229
5230     status = U_ZERO_ERROR;
5231     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5232     REGEX_CHECK_STATUS;
5233     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5234
5235     status = U_ZERO_ERROR;
5236     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5237     REGEX_CHECK_STATUS;
5238     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5239
5240     status = U_ZERO_ERROR;
5241     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5242     REGEX_CHECK_STATUS;
5243     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5244
5245     status = U_ZERO_ERROR;
5246     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5247     REGEX_CHECK_STATUS;
5248     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5249
5250     status = U_ZERO_ERROR;
5251     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5252     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5253
5254     status = U_ZERO_ERROR;
5255     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5256     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5257     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5258
5259     status = U_ZERO_ERROR;
5260     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5261     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5262     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5263
5264     status = U_ZERO_ERROR;
5265     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5266     REGEX_CHECK_STATUS;
5267     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5268
5269     status = U_ZERO_ERROR;
5270     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5271     REGEX_CHECK_STATUS;
5272     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5273
5274     status = U_ZERO_ERROR;
5275     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5276     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5277
5278     status = U_ZERO_ERROR;
5279     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5280     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5281
5282     status = U_ZERO_ERROR;
5283     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5284     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5285
5286     status = U_ZERO_ERROR;
5287     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5288     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5289
5290     delete m;
5291
5292     // Repeat the above replaceAll() tests using the plain C API, which
5293     //  has a separate implementation internally.
5294     //  TODO: factor out the test data.
5295
5296     status = U_ZERO_ERROR;
5297     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5298     REGEX_CHECK_STATUS;
5299     text = UnicodeString("abcmxyz");
5300     uregex_setText(re, text.getBuffer(), text.length(), &status);
5301     REGEX_CHECK_STATUS;
5302
5303     UChar resultBuf[100];
5304     int32_t resultLength;
5305     UnicodeString repl;
5306
5307     status = U_ZERO_ERROR;
5308     repl = UnicodeString("<$0>");
5309     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5310     REGEX_CHECK_STATUS;
5311     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5312
5313     status = U_ZERO_ERROR;
5314     repl = UnicodeString("<$1>");
5315     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5316     REGEX_CHECK_STATUS;
5317     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5318
5319     status = U_ZERO_ERROR;
5320     repl = UnicodeString("<${one}>");
5321     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5322     REGEX_CHECK_STATUS;
5323     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5324
5325     status = U_ZERO_ERROR;
5326     repl = UnicodeString("<$2>");
5327     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5328     REGEX_CHECK_STATUS;
5329     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5330
5331     status = U_ZERO_ERROR;
5332     repl = UnicodeString("<$3>");
5333     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5334     REGEX_CHECK_STATUS;
5335     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5336
5337     status = U_ZERO_ERROR;
5338     repl = UnicodeString("<$4>");
5339     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5340     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5341
5342     status = U_ZERO_ERROR;
5343     repl = UnicodeString("<$04>");
5344     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5345     REGEX_CHECK_STATUS;
5346     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5347
5348     status = U_ZERO_ERROR;
5349     repl = UnicodeString("<$000016>");
5350     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5351     REGEX_CHECK_STATUS;
5352     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5353
5354     status = U_ZERO_ERROR;
5355     repl = UnicodeString("<$3$2$1${one}>");
5356     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5357     REGEX_CHECK_STATUS;
5358     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5359
5360     status = U_ZERO_ERROR;
5361     repl = UnicodeString("$3$2$1${one}");
5362     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5363     REGEX_CHECK_STATUS;
5364     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5365
5366     status = U_ZERO_ERROR;
5367     repl = UnicodeString("<${noSuchName}>");
5368     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5369     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5370
5371     status = U_ZERO_ERROR;
5372     repl = UnicodeString("<${invalid-name}>");
5373     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5375
5376     status = U_ZERO_ERROR;
5377     repl = UnicodeString("<${one");
5378     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5379     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5380
5381     status = U_ZERO_ERROR;
5382     repl = UnicodeString("$not a capture group");
5383     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5384     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5385
5386     uregex_close(re);
5387 }
5388
5389 //--------------------------------------------------------------
5390 //
5391 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5392 //                       The point is not so much what the exact limit is,
5393 //                       but that a largish number doesn't hit bad non-linear performance,
5394 //                       and that exceeding the limit fails cleanly.
5395 //
5396 //--------------------------------------------------------------
5397 void RegexTest::NamedCaptureLimits() {
5398     if (quick) {
5399         logln("Skipping test. Runs in exhuastive mode only.");
5400         return;
5401     }
5402     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5403     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5404     char nnbuf[100];
5405     UnicodeString pattern;
5406     int32_t nn;
5407
5408     for (nn=1; nn<goodLimit; nn++) {
5409         sprintf(nnbuf, "(?<nn%d>)", nn);
5410         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5411     }
5412     UErrorCode status = U_ZERO_ERROR;
5413     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5414     REGEX_CHECK_STATUS;
5415     for (nn=1; nn<goodLimit; nn++) {
5416         sprintf(nnbuf, "nn%d", nn);
5417         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5418         REGEX_ASSERT(nn == groupNum);
5419         if (nn != groupNum) {
5420             break;
5421         }
5422     }
5423     delete pat;
5424
5425     pattern.remove();
5426     for (nn=1; nn<failLimit; nn++) {
5427         sprintf(nnbuf, "(?<nn%d>)", nn);
5428         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5429     }
5430     status = U_ZERO_ERROR;
5431     pat = RegexPattern::compile(pattern, 0, status);
5432     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5433     delete pat;
5434 }
5435
5436
5437 //--------------------------------------------------------------
5438 //
5439 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5440 //
5441 //---------------------------------------------------------------
5442 void RegexTest::Bug7651() {
5443     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5444     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5445     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5446     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5447     UnicodeString s("#ff @abcd This is test");
5448     RegexPattern  *REPattern = NULL;
5449     RegexMatcher  *REMatcher = NULL;
5450     UErrorCode status = U_ZERO_ERROR;
5451     UParseError pe;
5452
5453     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5454     REGEX_CHECK_STATUS;
5455     REMatcher = REPattern->matcher(s, status);
5456     REGEX_CHECK_STATUS;
5457     REGEX_ASSERT(REMatcher->find());
5458     REGEX_ASSERT(REMatcher->start(status) == 0);
5459     delete REPattern;
5460     delete REMatcher;
5461     status = U_ZERO_ERROR;
5462
5463     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5464     REGEX_CHECK_STATUS;
5465     REMatcher = REPattern->matcher(s, status);
5466     REGEX_CHECK_STATUS;
5467     REGEX_ASSERT(REMatcher->find());
5468     REGEX_ASSERT(REMatcher->start(status) == 0);
5469     delete REPattern;
5470     delete REMatcher;
5471     status = U_ZERO_ERROR;
5472  }
5473
5474 void RegexTest::Bug7740() {
5475     UErrorCode status = U_ZERO_ERROR;
5476     UnicodeString pattern = "(a)";
5477     UnicodeString text = "abcdef";
5478     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5479     REGEX_CHECK_STATUS;
5480     REGEX_ASSERT(m->lookingAt(status));
5481     REGEX_CHECK_STATUS;
5482     status = U_ILLEGAL_ARGUMENT_ERROR;
5483     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5484     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5485     REGEX_ASSERT(s == "");
5486     delete m;
5487 }
5488
5489 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5490
5491 void RegexTest::Bug8479() {
5492     UErrorCode status = U_ZERO_ERROR;
5493
5494     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5495     REGEX_CHECK_STATUS;
5496     if (U_SUCCESS(status))
5497     {
5498         UnicodeString str;
5499         str.setToBogus();
5500         pMatcher->reset(str);
5501         status = U_ZERO_ERROR;
5502         pMatcher->matches(status);
5503         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5504         delete pMatcher;
5505     }
5506 }
5507
5508
5509 // Bug 7029
5510 void RegexTest::Bug7029() {
5511     UErrorCode status = U_ZERO_ERROR;
5512
5513     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5514     UnicodeString text = "abc.def";
5515     UnicodeString splits[10];
5516     REGEX_CHECK_STATUS;
5517     int32_t numFields = pMatcher->split(text, splits, 10, status);
5518     REGEX_CHECK_STATUS;
5519     REGEX_ASSERT(numFields == 8);
5520     delete pMatcher;
5521 }
5522
5523 // Bug 9283
5524 //   This test is checking for the existance of any supplemental characters that case-fold
5525 //   to a bmp character.
5526 //
5527 //   At the time of this writing there are none. If any should appear in a subsequent release
5528 //   of Unicode, the code in regular expressions compilation that determines the longest
5529 //   posssible match for a literal string  will need to be enhanced.
5530 //
5531 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5532 //   for details on what to do in case of a failure of this test.
5533 //
5534 void RegexTest::Bug9283() {
5535 #if !UCONFIG_NO_NORMALIZATION
5536     UErrorCode status = U_ZERO_ERROR;
5537     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5538     REGEX_CHECK_STATUS;
5539     int32_t index;
5540     UChar32 c;
5541     for (index=0; ; index++) {
5542         c = supplementalsWithCaseFolding.charAt(index);
5543         if (c == -1) {
5544             break;
5545         }
5546         UnicodeString cf = UnicodeString(c).foldCase();
5547         REGEX_ASSERT(cf.length() >= 2);
5548     }
5549 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5550 }
5551
5552
5553 void RegexTest::CheckInvBufSize() {
5554   if(inv_next>=INV_BUFSIZ) {
5555     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5556           __FILE__, INV_BUFSIZ, inv_next);
5557   } else {
5558     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5559   }
5560 }
5561
5562
5563 void RegexTest::Bug10459() {
5564     UErrorCode status = U_ZERO_ERROR;
5565     UnicodeString patternString("(txt)");
5566     UnicodeString txtString("txt");
5567
5568     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5569     REGEX_CHECK_STATUS;
5570     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5571     REGEX_CHECK_STATUS;
5572
5573     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5574     REGEX_CHECK_STATUS;
5575
5576     uregex_setUText(icu_re, utext_txt, &status);
5577     REGEX_CHECK_STATUS;
5578
5579     // The bug was that calling uregex_group() before doing a matching operation
5580     //   was causing a segfault. Only for Regular Expressions created from UText.
5581     //   It should set an U_REGEX_INVALID_STATE.
5582
5583     UChar buf[100];
5584     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5585     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5586     REGEX_ASSERT(len == 0);
5587
5588     uregex_close(icu_re);
5589     utext_close(utext_pat);
5590     utext_close(utext_txt);
5591 }
5592
5593 void RegexTest::TestCaseInsensitiveStarters() {
5594     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5595     //  become stale because of new Unicode characters.
5596     // If it is stale, rerun the generation tool
5597     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5598     // and replace the embedded data in i18n/regexcmp.cpp
5599
5600     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5601         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5602             continue;
5603         }
5604         UnicodeSet s(cp, cp);
5605         s.closeOver(USET_CASE_INSENSITIVE);
5606         UnicodeSetIterator setIter(s);
5607         while (setIter.next()) {
5608             if (!setIter.isString()) {
5609                 continue;
5610             }
5611             const UnicodeString &str = setIter.getString();
5612             UChar32 firstChar = str.char32At(0);
5613             UnicodeSet starters;
5614             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5615             if (!starters.contains(cp)) {
5616                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5617                 return;
5618             }
5619         }
5620     }
5621 }
5622
5623
5624 void RegexTest::TestBug11049() {
5625     // Original bug report: pattern with match start consisting of one of several individual characters,
5626     //  and the text being matched ending with a supplementary character. find() would read past the
5627     //  end of the input text when searching for potential match starting points.
5628
5629     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5630     // detect the bad read.
5631
5632     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5633     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5634
5635     // Test again with a pattern starting with a single character,
5636     // which takes a different code path than starting with an OR expression,
5637     // but with similar logic.
5638     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5639     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5640 }
5641
5642 // Run a single test case from TestBug11049(). Internal function.
5643 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5644     UErrorCode status = U_ZERO_ERROR;
5645     UnicodeString patternString = UnicodeString(pattern).unescape();
5646     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5647
5648     UnicodeString dataString = UnicodeString(data).unescape();
5649     UChar *exactBuffer = new UChar[dataString.length()];
5650     dataString.extract(exactBuffer, dataString.length(), status);
5651     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5652
5653     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5654     REGEX_CHECK_STATUS;
5655     matcher->reset(ut);
5656     UBool result = matcher->find();
5657     if (result != expectMatch) {
5658         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5659               __FILE__, lineNumber, expectMatch, result, pattern, data);
5660     }
5661
5662     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5663     //   off-by-one on find() with match at the last code point.
5664     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5665     //   because string.unescape() will only shrink it.
5666     char * utf8Buffer = new char[uprv_strlen(data)+1];
5667     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5668     REGEX_CHECK_STATUS;
5669     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5670     REGEX_CHECK_STATUS;
5671     matcher->reset(ut);
5672     result = matcher->find();
5673     if (result != expectMatch) {
5674         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5675               __FILE__, lineNumber, expectMatch, result, pattern, data);
5676     }
5677     delete [] utf8Buffer;
5678
5679     utext_close(ut);
5680     delete [] exactBuffer;
5681 }
5682
5683
5684 void RegexTest::TestBug11371() {
5685     if (quick) {
5686         logln("Skipping test. Runs in exhuastive mode only.");
5687         return;
5688     }
5689     UErrorCode status = U_ZERO_ERROR;
5690     UnicodeString patternString;
5691
5692     for (int i=0; i<8000000; i++) {
5693         patternString.append(UnicodeString("()"));
5694     }
5695     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5696     if (status != U_REGEX_PATTERN_TOO_BIG) {
5697         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5698               __FILE__, __LINE__, u_errorName(status));
5699     }
5700
5701     status = U_ZERO_ERROR;
5702     patternString = "(";
5703     for (int i=0; i<20000000; i++) {
5704         patternString.append(UnicodeString("A++"));
5705     }
5706     patternString.append(UnicodeString("){0}B++"));
5707     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5708     if (status != U_REGEX_PATTERN_TOO_BIG) {
5709         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5710               __FILE__, __LINE__, u_errorName(status));
5711     }
5712
5713     // Pattern with too much string data, such that string indexes overflow operand data field size
5714     // in compiled instruction.
5715     status = U_ZERO_ERROR;
5716     patternString = "";
5717     while (patternString.length() < 0x00ffffff) {
5718         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5719     }
5720     patternString.append(UnicodeString("X? trailing string"));
5721     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5722     if (status != U_REGEX_PATTERN_TOO_BIG) {
5723         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5724               __FILE__, __LINE__, u_errorName(status));
5725     }
5726 }
5727
5728 void RegexTest::TestBug11480() {
5729     // C API, get capture group of a group that does not participate in the match.
5730     //        (Returns a zero length string, with nul termination,
5731     //         indistinguishable from a group with a zero length match.)
5732
5733     UErrorCode status = U_ZERO_ERROR;
5734     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5735     REGEX_CHECK_STATUS;
5736     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5737     uregex_setText(re, text.getBuffer(), text.length(), &status);
5738     REGEX_CHECK_STATUS;
5739     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5740     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5741     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5742     REGEX_ASSERT(length == 0);
5743     REGEX_ASSERT(buf[0] == 13);
5744     REGEX_ASSERT(buf[1] == 0);
5745     REGEX_ASSERT(buf[2] == 13);
5746     uregex_close(re);
5747
5748     // UText C++ API, length of match is 0 for non-participating matches.
5749     UText ut = UTEXT_INITIALIZER;
5750     utext_openUnicodeString(&ut, &text, &status);
5751     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5752     REGEX_CHECK_STATUS;
5753     matcher.reset(&ut);
5754     REGEX_ASSERT(matcher.lookingAt(0, status));
5755
5756     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5757     int64_t groupLen = -666;
5758     UText group = UTEXT_INITIALIZER;
5759     matcher.group(1, &group, groupLen, status);
5760     REGEX_CHECK_STATUS;
5761     REGEX_ASSERT(groupLen == 1);
5762     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5763
5764     // Capture group 2, the (B), does not participate in the match.
5765     matcher.group(2, &group, groupLen, status);
5766     REGEX_CHECK_STATUS;
5767     REGEX_ASSERT(groupLen == 0);
5768     REGEX_ASSERT(matcher.start(2, status) == -1);
5769     REGEX_CHECK_STATUS;
5770 }
5771
5772 void RegexTest::TestBug12884() {
5773     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5774     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5775     UnicodeString text(u"hello");
5776     UErrorCode status = U_ZERO_ERROR;
5777     RegexMatcher m(pattern, text, 0, status);
5778     REGEX_CHECK_STATUS;
5779     m.setTimeLimit(5, status);
5780     m.find(status);
5781     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5782
5783     // Non-greedy loops. They take a different code path during matching.
5784     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5785     status = U_ZERO_ERROR;
5786     RegexMatcher ngM(ngPattern, text, 0, status);
5787     REGEX_CHECK_STATUS;
5788     ngM.setTimeLimit(5, status);
5789     ngM.find(status);
5790     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5791
5792     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5793     const char *text8 = u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5794                           "carácter, sin importar la plataforma, sin importar el programa,"
5795                           "sin importar el idioma.";
5796     status = U_ZERO_ERROR;
5797     LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5798     REGEX_CHECK_STATUS;
5799     m.reset(ut.getAlias());
5800     m.find(status);
5801     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5802
5803     status = U_ZERO_ERROR;
5804     ngM.reset(ut.getAlias());
5805     ngM.find(status);
5806     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5807 }
5808
5809 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */