icuSources/test/intltest/regextst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2016, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 //
   8 //   regextst.cpp
   9 //
  10 //      ICU Regular Expressions test, part of intltest.
  11 //
  12
  13 /*
  14      NOTE!!
  15
  16      PLEASE be careful about ASCII assumptions in this test.
  17      This test is one of the worst repeat offenders.
  18      If you have questions, contact someone on the ICU PMC
  19      who has access to an EBCDIC system.
  20
  21  */
  22
  23 #include "intltest.h"
  24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  25
  26 #include <stdlib.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29
  30 #include "unicode/localpointer.h"
  31 #include "unicode/regex.h"
  32 #include "unicode/uchar.h"
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uniset.h"
  35 #include "unicode/uregex.h"
  36 #include "unicode/usetiter.h"
  37 #include "unicode/ustring.h"
  38 #include "unicode/utext.h"
  39
  40 #include "regextst.h"
  41 #include "regexcmp.h"
  42 #include "uvector.h"
  43 #include "util.h"
  44 #include "cmemory.h"
  45 #include "cstring.h"
  46 #include "uinvchar.h"
  47
  48 #define SUPPORT_MUTATING_INPUT_STRING   0
  49
  50 //---------------------------------------------------------------------------
  51 //
  52 //  Test class boilerplate
  53 //
  54 //---------------------------------------------------------------------------
  55 RegexTest::RegexTest()
  56 {
  57 }
  58
  59
  60 RegexTest::~RegexTest()
  61 {
  62 }
  63
  64
  65
  66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  67 {
  68     if (exec) logln("TestSuite RegexTest: ");
  69     switch (index) {
  70
  71         case 0: name = "Basic";
  72             if (exec) Basic();
  73             break;
  74         case 1: name = "API_Match";
  75             if (exec) API_Match();
  76             break;
  77         case 2: name = "API_Replace";
  78             if (exec) API_Replace();
  79             break;
  80         case 3: name = "API_Pattern";
  81             if (exec) API_Pattern();
  82             break;
  83         case 4:
  84 #if !UCONFIG_NO_FILE_IO
  85             name = "Extended";
  86             if (exec) Extended();
  87 #else
  88             name = "skip";
  89 #endif
  90             break;
  91         case 5: name = "Errors";
  92             if (exec) Errors();
  93             break;
  94         case 6: name = "PerlTests";
  95             if (exec) PerlTests();
  96             break;
  97         case 7: name = "Callbacks";
  98             if (exec) Callbacks();
  99             break;
 100         case 8: name = "FindProgressCallbacks";
 101             if (exec) FindProgressCallbacks();
 102             break;
 103         case 9: name = "Bug 6149";
 104              if (exec) Bug6149();
 105              break;
 106         case 10: name = "UTextBasic";
 107           if (exec) UTextBasic();
 108           break;
 109         case 11: name = "API_Match_UTF8";
 110           if (exec) API_Match_UTF8();
 111           break;
 112         case 12: name = "API_Replace_UTF8";
 113           if (exec) API_Replace_UTF8();
 114           break;
 115         case 13: name = "API_Pattern_UTF8";
 116           if (exec) API_Pattern_UTF8();
 117           break;
 118         case 14: name = "PerlTestsUTF8";
 119           if (exec) PerlTestsUTF8();
 120           break;
 121         case 15: name = "PreAllocatedUTextCAPI";
 122           if (exec) PreAllocatedUTextCAPI();
 123           break;
 124         case 16: name = "Bug 7651";
 125              if (exec) Bug7651();
 126              break;
 127         case 17: name = "Bug 7740";
 128             if (exec) Bug7740();
 129             break;
 130         case 18: name = "Bug 8479";
 131             if (exec) Bug8479();
 132             break;
 133         case 19: name = "Bug 7029";
 134             if (exec) Bug7029();
 135             break;
 136         case 20: name = "CheckInvBufSize";
 137             if (exec) CheckInvBufSize();
 138             break;
 139         case 21: name = "Bug 9283";
 140             if (exec) Bug9283();
 141             break;
 142         case 22: name = "Bug10459";
 143             if (exec) Bug10459();
 144             break;
 145         case 23: name = "TestCaseInsensitiveStarters";
 146             if (exec) TestCaseInsensitiveStarters();
 147             break;
 148         case 24: name = "TestBug11049";
 149             if (exec) TestBug11049();
 150             break;
 151         case 25: name = "TestBug11371";
 152             if (exec) TestBug11371();
 153             break;
 154         case 26: name = "TestBug11480";
 155             if (exec) TestBug11480();
 156             break;
 157         case 27: name = "NamedCapture";
 158             if (exec) NamedCapture();
 159             break;
 160         case 28: name = "NamedCaptureLimits";
 161             if (exec) NamedCaptureLimits();
 162             break;
 163         default: name = "";
 164             break; //needed to end loop
 165     }
 166 }
 167
 168
 169
 170 /**
 171  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 172  * into ASCII.
 173  * @see utext_openUTF8
 174  */
 175 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 176
 177 //---------------------------------------------------------------------------
 178 //
 179 //   Error Checking / Reporting macros used in all of the tests.
 180 //
 181 //---------------------------------------------------------------------------
 182
 183 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 184   int64_t oldIndex = utext_getNativeIndex(text);
 185   utext_setNativeIndex(text, 0);
 186   char *bufPtr = buf;
 187   UChar32 c = utext_next32From(text, 0);
 188   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 189     if (0x000020<=c && c<0x00007e) {
 190       *bufPtr = c;
 191     } else {
 192 #if 0
 193       sprintf(bufPtr,"U+%04X", c);
 194       bufPtr+= strlen(bufPtr)-1;
 195 #else
 196       *bufPtr = '%';
 197 #endif
 198     }
 199     bufPtr++;
 200     c = UTEXT_NEXT32(text);
 201   }
 202   *bufPtr = 0;
 203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 204   char *ebuf = (char*)malloc(bufLen);
 205   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 206   uprv_strncpy(buf, ebuf, bufLen);
 207   free((void*)ebuf);
 208 #endif
 209   utext_setNativeIndex(text, oldIndex);
 210 }
 211
 212
 213 static char ASSERT_BUF[1024];
 214
 215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 216   if(message.length()==0) {
 217     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 218   } else {
 219     UnicodeString buf;
 220     IntlTest::prettify(message,buf);
 221     if(buf.length()==0) {
 222       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 223     } else {
 224       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 225       if(ASSERT_BUF[0]==0) {
 226         ASSERT_BUF[0]=0;
 227         for(int32_t i=0;i<buf.length();i++) {
 228           UChar ch = buf[i];
 229           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 230         }
 231       }
 232     }
 233   }
 234   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 235   return ASSERT_BUF;
 236 }
 237
 238 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
 239
 240 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
 241                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
 242
 243 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
 244
 245 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
 246 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
 247     __LINE__, u_errorName(errcode), u_errorName(status));};}
 248
 249 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
 250     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
 251
 252 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
 253     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
 254
 255 // expected: const char * , restricted to invariant characters.
 256 // actual: const UnicodeString &
 257 #define REGEX_ASSERT_UNISTR(expected, actual) { \
 258     if (UnicodeString(expected, -1, US_INV) != (actual)) { \
 259         errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
 260                 __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
 261
 262
 263 static UBool testUTextEqual(UText *uta, UText *utb) {
 264     UChar32 ca = 0;
 265     UChar32 cb = 0;
 266     utext_setNativeIndex(uta, 0);
 267     utext_setNativeIndex(utb, 0);
 268     do {
 269         ca = utext_next32(uta);
 270         cb = utext_next32(utb);
 271         if (ca != cb) {
 272             break;
 273         }
 274     } while (ca != U_SENTINEL);
 275     return ca == cb;
 276 }
 277
 278
 279 /**
 280  * @param expected expected text in UTF-8 (not platform) codepage
 281  */
 282 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 283     UErrorCode status = U_ZERO_ERROR;
 284     UText expectedText = UTEXT_INITIALIZER;
 285     utext_openUTF8(&expectedText, expected, -1, &status);
 286     if(U_FAILURE(status)) {
 287       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 288       return;
 289     }
 290     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 291       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 292       return;
 293     }
 294     utext_setNativeIndex(actual, 0);
 295     if (!testUTextEqual(&expectedText, actual)) {
 296         char buf[201 /*21*/];
 297         char expectedBuf[201];
 298         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 299         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 300         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 301     }
 302     utext_close(&expectedText);
 303 }
 304 /**
 305  * @param expected invariant (platform local text) input
 306  */
 307
 308 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 309     UErrorCode status = U_ZERO_ERROR;
 310     UText expectedText = UTEXT_INITIALIZER;
 311     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 312     if(U_FAILURE(status)) {
 313       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 314       return;
 315     }
 316     utext_setNativeIndex(actual, 0);
 317     if (!testUTextEqual(&expectedText, actual)) {
 318         char buf[201 /*21*/];
 319         char expectedBuf[201];
 320         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
 321         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
 322         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 323     }
 324     utext_close(&expectedText);
 325 }
 326
 327 /**
 328  * Assumes utf-8 input
 329  */
 330 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
 331 /**
 332  * Assumes Invariant input
 333  */
 334 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 335
 336 /**
 337  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 338  * passed into utext_openUTF8. An error will be given if
 339  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 340  */
 341
 342 #define INV_BUFSIZ 2048 /* increase this if too small */
 343
 344 static int64_t inv_next=0;
 345
 346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
 347 static char inv_buf[INV_BUFSIZ];
 348 #endif
 349
 350 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 351   if(length==-1) length=strlen(inv);
 352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 353   inv_next+=length;
 354   return utext_openUTF8(ut, inv, length, status);
 355 #else
 356   if(inv_next+length+1>INV_BUFSIZ) {
 357     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 358             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 359     *status = U_MEMORY_ALLOCATION_ERROR;
 360     return NULL;
 361   }
 362
 363   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 364   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 365   inv_next+=length;
 366
 367 #if 0
 368   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 369 #endif
 370
 371   return utext_openUTF8(ut, (const char*)buf, length, status);
 372 #endif
 373 }
 374
 375
 376 //---------------------------------------------------------------------------
 377 //
 378 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 379 //                       for the LookingAt() and  Match() functions.
 380 //
 381 //       usage:
 382 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 383 //
 384 //          The expected results are UBool - TRUE or FALSE.
 385 //          The input text is unescaped.  The pattern is not.
 386 //
 387 //
 388 //---------------------------------------------------------------------------
 389
 390 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 391
 392 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 393     const UnicodeString pattern(pat, -1, US_INV);
 394     const UnicodeString inputText(text, -1, US_INV);
 395     UErrorCode          status  = U_ZERO_ERROR;
 396     UParseError         pe;
 397     RegexPattern        *REPattern = NULL;
 398     RegexMatcher        *REMatcher = NULL;
 399     UBool               retVal     = TRUE;
 400
 401     UnicodeString patString(pat, -1, US_INV);
 402     REPattern = RegexPattern::compile(patString, 0, pe, status);
 403     if (U_FAILURE(status)) {
 404         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 405             line, u_errorName(status));
 406         return FALSE;
 407     }
 408     if (line==376) { REPattern->dumpPattern();}
 409
 410     UnicodeString inputString(inputText);
 411     UnicodeString unEscapedInput = inputString.unescape();
 412     REMatcher = REPattern->matcher(unEscapedInput, status);
 413     if (U_FAILURE(status)) {
 414         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 415             line, u_errorName(status));
 416         return FALSE;
 417     }
 418
 419     UBool actualmatch;
 420     actualmatch = REMatcher->lookingAt(status);
 421     if (U_FAILURE(status)) {
 422         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 423             line, u_errorName(status));
 424         retVal =  FALSE;
 425     }
 426     if (actualmatch != looking) {
 427         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 428         retVal = FALSE;
 429     }
 430
 431     status = U_ZERO_ERROR;
 432     actualmatch = REMatcher->matches(status);
 433     if (U_FAILURE(status)) {
 434         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 435             line, u_errorName(status));
 436         retVal = FALSE;
 437     }
 438     if (actualmatch != match) {
 439         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 440         retVal = FALSE;
 441     }
 442
 443     if (retVal == FALSE) {
 444         REPattern->dumpPattern();
 445     }
 446
 447     delete REPattern;
 448     delete REMatcher;
 449     return retVal;
 450 }
 451
 452
 453 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 454     UText               pattern    = UTEXT_INITIALIZER;
 455     int32_t             inputUTF8Length;
 456     char                *textChars = NULL;
 457     UText               inputText  = UTEXT_INITIALIZER;
 458     UErrorCode          status     = U_ZERO_ERROR;
 459     UParseError         pe;
 460     RegexPattern        *REPattern = NULL;
 461     RegexMatcher        *REMatcher = NULL;
 462     UBool               retVal     = TRUE;
 463
 464     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 465     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 466     if (U_FAILURE(status)) {
 467         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 468             line, u_errorName(status));
 469         return FALSE;
 470     }
 471
 472     UnicodeString inputString(text, -1, US_INV);
 473     UnicodeString unEscapedInput = inputString.unescape();
 474     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 475     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 476
 477     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 478     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 479         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 480         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 481         return TRUE; // not a failure of the Regex engine
 482     }
 483     status = U_ZERO_ERROR; // buffer overflow
 484     textChars = new char[inputUTF8Length+1];
 485     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 486     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 487
 488     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 489     if (U_FAILURE(status)) {
 490         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 491             line, u_errorName(status));
 492         return FALSE;
 493     }
 494
 495     UBool actualmatch;
 496     actualmatch = REMatcher->lookingAt(status);
 497     if (U_FAILURE(status)) {
 498         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 499             line, u_errorName(status));
 500         retVal =  FALSE;
 501     }
 502     if (actualmatch != looking) {
 503         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 504         retVal = FALSE;
 505     }
 506
 507     status = U_ZERO_ERROR;
 508     actualmatch = REMatcher->matches(status);
 509     if (U_FAILURE(status)) {
 510         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 511             line, u_errorName(status));
 512         retVal = FALSE;
 513     }
 514     if (actualmatch != match) {
 515         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 516         retVal = FALSE;
 517     }
 518
 519     if (retVal == FALSE) {
 520         REPattern->dumpPattern();
 521     }
 522
 523     delete REPattern;
 524     delete REMatcher;
 525     utext_close(&inputText);
 526     utext_close(&pattern);
 527     delete[] textChars;
 528     return retVal;
 529 }
 530
 531
 532
 533 //---------------------------------------------------------------------------
 534 //
 535 //    REGEX_ERR       Macro + invocation function to simplify writing tests
 536 //                       regex tests for incorrect patterns
 537 //
 538 //       usage:
 539 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 540 //
 541 //---------------------------------------------------------------------------
 542 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 543
 544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 545                           UErrorCode expectedStatus, int32_t line) {
 546     UnicodeString       pattern(pat);
 547
 548     UErrorCode          status         = U_ZERO_ERROR;
 549     UParseError         pe;
 550     RegexPattern        *callerPattern = NULL;
 551
 552     //
 553     //  Compile the caller's pattern
 554     //
 555     UnicodeString patString(pat);
 556     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 557     if (status != expectedStatus) {
 558         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 559     } else {
 560         if (status != U_ZERO_ERROR) {
 561             if (pe.line != errLine || pe.offset != errCol) {
 562                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 563                     line, errLine, errCol, pe.line, pe.offset);
 564             }
 565         }
 566     }
 567
 568     delete callerPattern;
 569
 570     //
 571     //  Compile again, using a UTF-8-based UText
 572     //
 573     UText patternText = UTEXT_INITIALIZER;
 574     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 575     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 576     if (status != expectedStatus) {
 577         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 578     } else {
 579         if (status != U_ZERO_ERROR) {
 580             if (pe.line != errLine || pe.offset != errCol) {
 581                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 582                     line, errLine, errCol, pe.line, pe.offset);
 583             }
 584         }
 585     }
 586
 587     delete callerPattern;
 588     utext_close(&patternText);
 589 }
 590
 591
 592
 593 //---------------------------------------------------------------------------
 594 //
 595 //      Basic      Check for basic functionality of regex pattern matching.
 596 //                 Avoid the use of REGEX_FIND test macro, which has
 597 //                 substantial dependencies on basic Regex functionality.
 598 //
 599 //---------------------------------------------------------------------------
 600 void RegexTest::Basic() {
 601
 602
 603 //
 604 // Debug - slide failing test cases early
 605 //
 606 #if 0
 607     {
 608         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
 609         UParseError pe;
 610         UErrorCode  status = U_ZERO_ERROR;
 611         RegexPattern *pattern;
 612         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
 613         pattern->dumpPattern();
 614         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
 615         UBool result = m->find();
 616         printf("result = %d\n", result);
 617         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
 618         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
 619     }
 620     exit(1);
 621 #endif
 622
 623
 624     //
 625     // Pattern with parentheses
 626     //
 627     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
 628     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
 629     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
 630
 631     //
 632     // Patterns with *
 633     //
 634     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
 635     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
 636     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
 637     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
 638     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
 639
 640     REGEX_TESTLM("a*", "",  TRUE, TRUE);
 641     REGEX_TESTLM("a*", "b", TRUE, FALSE);
 642
 643
 644     //
 645     //  Patterns with "."
 646     //
 647     REGEX_TESTLM(".", "abc", TRUE, FALSE);
 648     REGEX_TESTLM("...", "abc", TRUE, TRUE);
 649     REGEX_TESTLM("....", "abc", FALSE, FALSE);
 650     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
 651     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
 652     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
 653     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
 654     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
 655
 656     //
 657     //  Patterns with * applied to chars at end of literal string
 658     //
 659     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
 660     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
 661
 662     //
 663     //  Supplemental chars match as single chars, not a pair of surrogates.
 664     //
 665     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
 666     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
 667     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
 668
 669
 670     //
 671     //  UnicodeSets in the pattern
 672     //
 673     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
 674     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
 675     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
 676     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 677     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 678     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
 679
 680     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
 681     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
 682     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
 683     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
 684     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
 685
 686     //
 687     //   OR operator in patterns
 688     //
 689     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
 690     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
 691     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
 692     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
 693
 694     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
 695     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
 696     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
 697     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
 698     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
 699     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
 700
 701     //
 702     //  +
 703     //
 704     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
 705     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
 706     REGEX_TESTLM("b+", "", FALSE, FALSE);
 707     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
 708     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
 709     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
 710
 711     //
 712     //   ?
 713     //
 714     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
 715     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
 716     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
 717     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
 718     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
 719     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
 720     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
 721     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
 722     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
 723
 724     //
 725     //  Escape sequences that become single literal chars, handled internally
 726     //   by ICU's Unescape.
 727     //
 728
 729     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
 730     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
 731     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
 732     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
 733     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
 734     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
 735     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
 736     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
 737     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
 738     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
 739
 740     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
 741     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
 742
 743     // Escape of special chars in patterns
 744     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
 745 }
 746
 747
 748 //---------------------------------------------------------------------------
 749 //
 750 //    UTextBasic   Check for quirks that are specific to the UText
 751 //                 implementation.
 752 //
 753 //---------------------------------------------------------------------------
 754 void RegexTest::UTextBasic() {
 755     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 756     UErrorCode status = U_ZERO_ERROR;
 757     UText pattern = UTEXT_INITIALIZER;
 758     utext_openUTF8(&pattern, str_abc, -1, &status);
 759     RegexMatcher matcher(&pattern, 0, status);
 760     REGEX_CHECK_STATUS;
 761
 762     UText input = UTEXT_INITIALIZER;
 763     utext_openUTF8(&input, str_abc, -1, &status);
 764     REGEX_CHECK_STATUS;
 765     matcher.reset(&input);
 766     REGEX_CHECK_STATUS;
 767     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 768
 769     matcher.reset(matcher.inputText());
 770     REGEX_CHECK_STATUS;
 771     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 772
 773     utext_close(&pattern);
 774     utext_close(&input);
 775 }
 776
 777
 778 //---------------------------------------------------------------------------
 779 //
 780 //      API_Match   Test that the API for class RegexMatcher
 781 //                  is present and nominally working, but excluding functions
 782 //                  implementing replace operations.
 783 //
 784 //---------------------------------------------------------------------------
 785 void RegexTest::API_Match() {
 786     UParseError         pe;
 787     UErrorCode          status=U_ZERO_ERROR;
 788     int32_t             flags = 0;
 789
 790     //
 791     // Debug - slide failing test cases early
 792     //
 793 #if 0
 794     {
 795     }
 796     return;
 797 #endif
 798
 799     //
 800     // Simple pattern compilation
 801     //
 802     {
 803         UnicodeString       re("abc");
 804         RegexPattern        *pat2;
 805         pat2 = RegexPattern::compile(re, flags, pe, status);
 806         REGEX_CHECK_STATUS;
 807
 808         UnicodeString inStr1 = "abcdef this is a test";
 809         UnicodeString instr2 = "not abc";
 810         UnicodeString empty  = "";
 811
 812
 813         //
 814         // Matcher creation and reset.
 815         //
 816         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 817         REGEX_CHECK_STATUS;
 818         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 819         REGEX_ASSERT(m1->input() == inStr1);
 820         m1->reset(instr2);
 821         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 822         REGEX_ASSERT(m1->input() == instr2);
 823         m1->reset(inStr1);
 824         REGEX_ASSERT(m1->input() == inStr1);
 825         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 826         m1->reset(empty);
 827         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 828         REGEX_ASSERT(m1->input() == empty);
 829         REGEX_ASSERT(&m1->pattern() == pat2);
 830
 831         //
 832         //  reset(pos, status)
 833         //
 834         m1->reset(inStr1);
 835         m1->reset(4, status);
 836         REGEX_CHECK_STATUS;
 837         REGEX_ASSERT(m1->input() == inStr1);
 838         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 839
 840         m1->reset(-1, status);
 841         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 842         status = U_ZERO_ERROR;
 843
 844         m1->reset(0, status);
 845         REGEX_CHECK_STATUS;
 846         status = U_ZERO_ERROR;
 847
 848         int32_t len = m1->input().length();
 849         m1->reset(len-1, status);
 850         REGEX_CHECK_STATUS;
 851         status = U_ZERO_ERROR;
 852
 853         m1->reset(len, status);
 854         REGEX_CHECK_STATUS;
 855         status = U_ZERO_ERROR;
 856
 857         m1->reset(len+1, status);
 858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 859         status = U_ZERO_ERROR;
 860
 861         //
 862         // match(pos, status)
 863         //
 864         m1->reset(instr2);
 865         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 866         m1->reset();
 867         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 868         m1->reset();
 869         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 870         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 871         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 872         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 873
 874         // Match() at end of string should fail, but should not
 875         //  be an error.
 876         status = U_ZERO_ERROR;
 877         len = m1->input().length();
 878         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 879         REGEX_CHECK_STATUS;
 880
 881         // Match beyond end of string should fail with an error.
 882         status = U_ZERO_ERROR;
 883         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 885
 886         // Successful match at end of string.
 887         {
 888             status = U_ZERO_ERROR;
 889             RegexMatcher m("A?", 0, status);  // will match zero length string.
 890             REGEX_CHECK_STATUS;
 891             m.reset(inStr1);
 892             len = inStr1.length();
 893             REGEX_ASSERT(m.matches(len, status) == TRUE);
 894             REGEX_CHECK_STATUS;
 895             m.reset(empty);
 896             REGEX_ASSERT(m.matches(0, status) == TRUE);
 897             REGEX_CHECK_STATUS;
 898         }
 899
 900
 901         //
 902         // lookingAt(pos, status)
 903         //
 904         status = U_ZERO_ERROR;
 905         m1->reset(instr2);  // "not abc"
 906         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 907         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 908         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 909         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 910         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 911         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 912         status = U_ZERO_ERROR;
 913         len = m1->input().length();
 914         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 915         REGEX_CHECK_STATUS;
 916         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 917         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 918
 919         delete m1;
 920         delete pat2;
 921     }
 922
 923
 924     //
 925     // Capture Group.
 926     //     RegexMatcher::start();
 927     //     RegexMatcher::end();
 928     //     RegexMatcher::groupCount();
 929     //
 930     {
 931         int32_t             flags=0;
 932         UParseError         pe;
 933         UErrorCode          status=U_ZERO_ERROR;
 934
 935         UnicodeString       re("01(23(45)67)(.*)");
 936         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 937         REGEX_CHECK_STATUS;
 938         UnicodeString data = "0123456789";
 939
 940         RegexMatcher *matcher = pat->matcher(data, status);
 941         REGEX_CHECK_STATUS;
 942         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 943         static const int32_t matchStarts[] = {0,  2, 4, 8};
 944         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 945         int32_t i;
 946         for (i=0; i<4; i++) {
 947             int32_t actualStart = matcher->start(i, status);
 948             REGEX_CHECK_STATUS;
 949             if (actualStart != matchStarts[i]) {
 950                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 951                     __LINE__, i, matchStarts[i], actualStart);
 952             }
 953             int32_t actualEnd = matcher->end(i, status);
 954             REGEX_CHECK_STATUS;
 955             if (actualEnd != matchEnds[i]) {
 956                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 957                     __LINE__, i, matchEnds[i], actualEnd);
 958             }
 959         }
 960
 961         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 962         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 963
 964         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 965         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 966         matcher->reset();
 967         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 968
 969         matcher->lookingAt(status);
 970         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 971         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 972         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 973         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 974         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 975         REGEX_CHECK_STATUS;
 976         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 977         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 978         matcher->reset();
 979         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 980
 981         delete matcher;
 982         delete pat;
 983
 984     }
 985
 986     //
 987     //  find
 988     //
 989     {
 990         int32_t             flags=0;
 991         UParseError         pe;
 992         UErrorCode          status=U_ZERO_ERROR;
 993
 994         UnicodeString       re("abc");
 995         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 996         REGEX_CHECK_STATUS;
 997         UnicodeString data = ".abc..abc...abc..";
 998         //                    012345678901234567
 999
1000         RegexMatcher *matcher = pat->matcher(data, status);
1001         REGEX_CHECK_STATUS;
1002         REGEX_ASSERT(matcher->find());
1003         REGEX_ASSERT(matcher->start(status) == 1);
1004         REGEX_ASSERT(matcher->find());
1005         REGEX_ASSERT(matcher->start(status) == 6);
1006         REGEX_ASSERT(matcher->find());
1007         REGEX_ASSERT(matcher->start(status) == 12);
1008         REGEX_ASSERT(matcher->find() == FALSE);
1009         REGEX_ASSERT(matcher->find() == FALSE);
1010
1011         matcher->reset();
1012         REGEX_ASSERT(matcher->find());
1013         REGEX_ASSERT(matcher->start(status) == 1);
1014
1015         REGEX_ASSERT(matcher->find(0, status));
1016         REGEX_ASSERT(matcher->start(status) == 1);
1017         REGEX_ASSERT(matcher->find(1, status));
1018         REGEX_ASSERT(matcher->start(status) == 1);
1019         REGEX_ASSERT(matcher->find(2, status));
1020         REGEX_ASSERT(matcher->start(status) == 6);
1021         REGEX_ASSERT(matcher->find(12, status));
1022         REGEX_ASSERT(matcher->start(status) == 12);
1023         REGEX_ASSERT(matcher->find(13, status) == FALSE);
1024         REGEX_ASSERT(matcher->find(16, status) == FALSE);
1025         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1026         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1027
1028         status = U_ZERO_ERROR;
1029         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1030         status = U_ZERO_ERROR;
1031         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032
1033         REGEX_ASSERT(matcher->groupCount() == 0);
1034
1035         delete matcher;
1036         delete pat;
1037     }
1038
1039
1040     //
1041     //  find, with \G in pattern (true if at the end of a previous match).
1042     //
1043     {
1044         int32_t             flags=0;
1045         UParseError         pe;
1046         UErrorCode          status=U_ZERO_ERROR;
1047
1048         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1049         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1050         REGEX_CHECK_STATUS;
1051         UnicodeString data = ".abcabc.abc..";
1052         //                    012345678901234567
1053
1054         RegexMatcher *matcher = pat->matcher(data, status);
1055         REGEX_CHECK_STATUS;
1056         REGEX_ASSERT(matcher->find());
1057         REGEX_ASSERT(matcher->start(status) == 0);
1058         REGEX_ASSERT(matcher->start(1, status) == -1);
1059         REGEX_ASSERT(matcher->start(2, status) == 1);
1060
1061         REGEX_ASSERT(matcher->find());
1062         REGEX_ASSERT(matcher->start(status) == 4);
1063         REGEX_ASSERT(matcher->start(1, status) == 4);
1064         REGEX_ASSERT(matcher->start(2, status) == -1);
1065         REGEX_CHECK_STATUS;
1066
1067         delete matcher;
1068         delete pat;
1069     }
1070
1071     //
1072     //   find with zero length matches, match position should bump ahead
1073     //     to prevent loops.
1074     //
1075     {
1076         int32_t                 i;
1077         UErrorCode          status=U_ZERO_ERROR;
1078         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1079                                                       //   using an always-true look-ahead.
1080         REGEX_CHECK_STATUS;
1081         UnicodeString s("    ");
1082         m.reset(s);
1083         for (i=0; ; i++) {
1084             if (m.find() == FALSE) {
1085                 break;
1086             }
1087             REGEX_ASSERT(m.start(status) == i);
1088             REGEX_ASSERT(m.end(status) == i);
1089         }
1090         REGEX_ASSERT(i==5);
1091
1092         // Check that the bump goes over surrogate pairs OK
1093         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1094         s = s.unescape();
1095         m.reset(s);
1096         for (i=0; ; i+=2) {
1097             if (m.find() == FALSE) {
1098                 break;
1099             }
1100             REGEX_ASSERT(m.start(status) == i);
1101             REGEX_ASSERT(m.end(status) == i);
1102         }
1103         REGEX_ASSERT(i==10);
1104     }
1105     {
1106         // find() loop breaking test.
1107         //        with pattern of /.?/, should see a series of one char matches, then a single
1108         //        match of zero length at the end of the input string.
1109         int32_t                 i;
1110         UErrorCode          status=U_ZERO_ERROR;
1111         RegexMatcher        m(".?", 0, status);
1112         REGEX_CHECK_STATUS;
1113         UnicodeString s("    ");
1114         m.reset(s);
1115         for (i=0; ; i++) {
1116             if (m.find() == FALSE) {
1117                 break;
1118             }
1119             REGEX_ASSERT(m.start(status) == i);
1120             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1121         }
1122         REGEX_ASSERT(i==5);
1123     }
1124
1125
1126     //
1127     // Matchers with no input string behave as if they had an empty input string.
1128     //
1129
1130     {
1131         UErrorCode status = U_ZERO_ERROR;
1132         RegexMatcher  m(".?", 0, status);
1133         REGEX_CHECK_STATUS;
1134         REGEX_ASSERT(m.find());
1135         REGEX_ASSERT(m.start(status) == 0);
1136         REGEX_ASSERT(m.input() == "");
1137     }
1138     {
1139         UErrorCode status = U_ZERO_ERROR;
1140         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1141         RegexMatcher  *m = p->matcher(status);
1142         REGEX_CHECK_STATUS;
1143
1144         REGEX_ASSERT(m->find() == FALSE);
1145         REGEX_ASSERT(m->input() == "");
1146         delete m;
1147         delete p;
1148     }
1149
1150     //
1151     // Regions
1152     //
1153     {
1154         UErrorCode status = U_ZERO_ERROR;
1155         UnicodeString testString("This is test data");
1156         RegexMatcher m(".*", testString,  0, status);
1157         REGEX_CHECK_STATUS;
1158         REGEX_ASSERT(m.regionStart() == 0);
1159         REGEX_ASSERT(m.regionEnd() == testString.length());
1160         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1161         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162
1163         m.region(2,4, status);
1164         REGEX_CHECK_STATUS;
1165         REGEX_ASSERT(m.matches(status));
1166         REGEX_ASSERT(m.start(status)==2);
1167         REGEX_ASSERT(m.end(status)==4);
1168         REGEX_CHECK_STATUS;
1169
1170         m.reset();
1171         REGEX_ASSERT(m.regionStart() == 0);
1172         REGEX_ASSERT(m.regionEnd() == testString.length());
1173
1174         UnicodeString shorterString("short");
1175         m.reset(shorterString);
1176         REGEX_ASSERT(m.regionStart() == 0);
1177         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1178
1179         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1180         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1181         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1182         REGEX_ASSERT(&m == &m.reset());
1183         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184
1185         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1186         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1187         REGEX_ASSERT(&m == &m.reset());
1188         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189
1190         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1191         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1192         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1193         REGEX_ASSERT(&m == &m.reset());
1194         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195
1196         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1197         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1198         REGEX_ASSERT(&m == &m.reset());
1199         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200
1201     }
1202
1203     //
1204     // hitEnd() and requireEnd()
1205     //
1206     {
1207         UErrorCode status = U_ZERO_ERROR;
1208         UnicodeString testString("aabb");
1209         RegexMatcher m1(".*", testString,  0, status);
1210         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1211         REGEX_ASSERT(m1.hitEnd() == TRUE);
1212         REGEX_ASSERT(m1.requireEnd() == FALSE);
1213         REGEX_CHECK_STATUS;
1214
1215         status = U_ZERO_ERROR;
1216         RegexMatcher m2("a*", testString, 0, status);
1217         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1218         REGEX_ASSERT(m2.hitEnd() == FALSE);
1219         REGEX_ASSERT(m2.requireEnd() == FALSE);
1220         REGEX_CHECK_STATUS;
1221
1222         status = U_ZERO_ERROR;
1223         RegexMatcher m3(".*$", testString, 0, status);
1224         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1225         REGEX_ASSERT(m3.hitEnd() == TRUE);
1226         REGEX_ASSERT(m3.requireEnd() == TRUE);
1227         REGEX_CHECK_STATUS;
1228     }
1229
1230
1231     //
1232     // Compilation error on reset with UChar *
1233     //   These were a hazard that people were stumbling over with runtime errors.
1234     //   Changed them to compiler errors by adding private methods that more closely
1235     //   matched the incorrect use of the functions.
1236     //
1237 #if 0
1238     {
1239         UErrorCode status = U_ZERO_ERROR;
1240         UChar ucharString[20];
1241         RegexMatcher m(".", 0, status);
1242         m.reset(ucharString);  // should not compile.
1243
1244         RegexPattern *p = RegexPattern::compile(".", 0, status);
1245         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1246
1247         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1248     }
1249 #endif
1250
1251     //
1252     //  Time Outs.
1253     //       Note:  These tests will need to be changed when the regexp engine is
1254     //              able to detect and cut short the exponential time behavior on
1255     //              this type of match.
1256     //
1257     {
1258         UErrorCode status = U_ZERO_ERROR;
1259         //    Enough 'a's in the string to cause the match to time out.
1260         //       (Each on additonal 'a' doubles the time)
1261         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1262         RegexMatcher matcher("(a+)+b", testString, 0, status);
1263         REGEX_CHECK_STATUS;
1264         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1265         matcher.setTimeLimit(100, status);
1266         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1267         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1269     }
1270     {
1271         UErrorCode status = U_ZERO_ERROR;
1272         //   Few enough 'a's to slip in under the time limit.
1273         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1274         RegexMatcher matcher("(a+)+b", testString, 0, status);
1275         REGEX_CHECK_STATUS;
1276         matcher.setTimeLimit(100, status);
1277         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1278         REGEX_CHECK_STATUS;
1279     }
1280
1281     //
1282     //  Stack Limits
1283     //
1284     {
1285         UErrorCode status = U_ZERO_ERROR;
1286         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1287
1288         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1289         //   of the '+', and makes the stack frames larger.
1290         RegexMatcher matcher("(A)+A$", testString, 0, status);
1291
1292         // With the default stack, this match should fail to run
1293         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1294         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1295
1296         // With unlimited stack, it should run
1297         status = U_ZERO_ERROR;
1298         matcher.setStackLimit(0, status);
1299         REGEX_CHECK_STATUS;
1300         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1301         REGEX_CHECK_STATUS;
1302         REGEX_ASSERT(matcher.getStackLimit() == 0);
1303
1304         // With a limited stack, it the match should fail
1305         status = U_ZERO_ERROR;
1306         matcher.setStackLimit(10000, status);
1307         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1308         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1309         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1310     }
1311
1312         // A pattern that doesn't save state should work with
1313         //   a minimal sized stack
1314     {
1315         UErrorCode status = U_ZERO_ERROR;
1316         UnicodeString testString = "abc";
1317         RegexMatcher matcher("abc", testString, 0, status);
1318         REGEX_CHECK_STATUS;
1319         matcher.setStackLimit(30, status);
1320         REGEX_CHECK_STATUS;
1321         REGEX_ASSERT(matcher.matches(status) == TRUE);
1322         REGEX_CHECK_STATUS;
1323         REGEX_ASSERT(matcher.getStackLimit() == 30);
1324
1325         // Negative stack sizes should fail
1326         status = U_ZERO_ERROR;
1327         matcher.setStackLimit(1000, status);
1328         REGEX_CHECK_STATUS;
1329         matcher.setStackLimit(-1, status);
1330         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1331         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1332     }
1333
1334
1335 }
1336
1337
1338
1339
1340
1341
1342 //---------------------------------------------------------------------------
1343 //
1344 //      API_Replace        API test for class RegexMatcher, testing the
1345 //                         Replace family of functions.
1346 //
1347 //---------------------------------------------------------------------------
1348 void RegexTest::API_Replace() {
1349     //
1350     //  Replace
1351     //
1352     int32_t             flags=0;
1353     UParseError         pe;
1354     UErrorCode          status=U_ZERO_ERROR;
1355
1356     UnicodeString       re("abc");
1357     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1358     REGEX_CHECK_STATUS;
1359     UnicodeString data = ".abc..abc...abc..";
1360     //                    012345678901234567
1361     RegexMatcher *matcher = pat->matcher(data, status);
1362
1363     //
1364     //  Plain vanilla matches.
1365     //
1366     UnicodeString  dest;
1367     dest = matcher->replaceFirst("yz", status);
1368     REGEX_CHECK_STATUS;
1369     REGEX_ASSERT(dest == ".yz..abc...abc..");
1370
1371     dest = matcher->replaceAll("yz", status);
1372     REGEX_CHECK_STATUS;
1373     REGEX_ASSERT(dest == ".yz..yz...yz..");
1374
1375     //
1376     //  Plain vanilla non-matches.
1377     //
1378     UnicodeString d2 = ".abx..abx...abx..";
1379     matcher->reset(d2);
1380     dest = matcher->replaceFirst("yz", status);
1381     REGEX_CHECK_STATUS;
1382     REGEX_ASSERT(dest == ".abx..abx...abx..");
1383
1384     dest = matcher->replaceAll("yz", status);
1385     REGEX_CHECK_STATUS;
1386     REGEX_ASSERT(dest == ".abx..abx...abx..");
1387
1388     //
1389     // Empty source string
1390     //
1391     UnicodeString d3 = "";
1392     matcher->reset(d3);
1393     dest = matcher->replaceFirst("yz", status);
1394     REGEX_CHECK_STATUS;
1395     REGEX_ASSERT(dest == "");
1396
1397     dest = matcher->replaceAll("yz", status);
1398     REGEX_CHECK_STATUS;
1399     REGEX_ASSERT(dest == "");
1400
1401     //
1402     // Empty substitution string
1403     //
1404     matcher->reset(data);              // ".abc..abc...abc.."
1405     dest = matcher->replaceFirst("", status);
1406     REGEX_CHECK_STATUS;
1407     REGEX_ASSERT(dest == "...abc...abc..");
1408
1409     dest = matcher->replaceAll("", status);
1410     REGEX_CHECK_STATUS;
1411     REGEX_ASSERT(dest == "........");
1412
1413     //
1414     // match whole string
1415     //
1416     UnicodeString d4 = "abc";
1417     matcher->reset(d4);
1418     dest = matcher->replaceFirst("xyz", status);
1419     REGEX_CHECK_STATUS;
1420     REGEX_ASSERT(dest == "xyz");
1421
1422     dest = matcher->replaceAll("xyz", status);
1423     REGEX_CHECK_STATUS;
1424     REGEX_ASSERT(dest == "xyz");
1425
1426     //
1427     // Capture Group, simple case
1428     //
1429     UnicodeString       re2("a(..)");
1430     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1431     REGEX_CHECK_STATUS;
1432     UnicodeString d5 = "abcdefg";
1433     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1434     REGEX_CHECK_STATUS;
1435     dest = matcher2->replaceFirst("$1$1", status);
1436     REGEX_CHECK_STATUS;
1437     REGEX_ASSERT(dest == "bcbcdefg");
1438
1439     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1440     REGEX_CHECK_STATUS;
1441     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1442
1443     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1444     REGEX_ASSERT(U_FAILURE(status));
1445     status = U_ZERO_ERROR;
1446
1447     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1448     replacement = replacement.unescape();
1449     dest = matcher2->replaceFirst(replacement, status);
1450     REGEX_CHECK_STATUS;
1451     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1452
1453     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1454
1455
1456     //
1457     // Replacement String with \u hex escapes
1458     //
1459     {
1460         UnicodeString  src = "abc 1 abc 2 abc 3";
1461         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1462         matcher->reset(src);
1463         UnicodeString  result = matcher->replaceAll(substitute, status);
1464         REGEX_CHECK_STATUS;
1465         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1466     }
1467     {
1468         UnicodeString  src = "abc !";
1469         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1470         matcher->reset(src);
1471         UnicodeString  result = matcher->replaceAll(substitute, status);
1472         REGEX_CHECK_STATUS;
1473         UnicodeString expected = UnicodeString("--");
1474         expected.append((UChar32)0x10000);
1475         expected.append("-- !");
1476         REGEX_ASSERT(result == expected);
1477     }
1478     // TODO:  need more through testing of capture substitutions.
1479
1480     // Bug 4057
1481     //
1482     {
1483         status = U_ZERO_ERROR;
1484         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1485         RegexMatcher m("ss(.*?)ee", 0, status);
1486         REGEX_CHECK_STATUS;
1487         UnicodeString result;
1488
1489         // Multiple finds do NOT bump up the previous appendReplacement postion.
1490         m.reset(s);
1491         m.find();
1492         m.find();
1493         m.appendReplacement(result, "ooh", status);
1494         REGEX_CHECK_STATUS;
1495         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1496
1497         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1498         status = U_ZERO_ERROR;
1499         result.truncate(0);
1500         m.reset(10, status);
1501         m.find();
1502         m.find();
1503         m.appendReplacement(result, "ooh", status);
1504         REGEX_CHECK_STATUS;
1505         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1506
1507         // find() at interior of string, appendReplacemnt still starts at beginning.
1508         status = U_ZERO_ERROR;
1509         result.truncate(0);
1510         m.reset();
1511         m.find(10, status);
1512         m.find();
1513         m.appendReplacement(result, "ooh", status);
1514         REGEX_CHECK_STATUS;
1515         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1516
1517         m.appendTail(result);
1518         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1519
1520     }
1521
1522     delete matcher2;
1523     delete pat2;
1524     delete matcher;
1525     delete pat;
1526 }
1527
1528
1529 //---------------------------------------------------------------------------
1530 //
1531 //      API_Pattern       Test that the API for class RegexPattern is
1532 //                        present and nominally working.
1533 //
1534 //---------------------------------------------------------------------------
1535 void RegexTest::API_Pattern() {
1536     RegexPattern        pata;    // Test default constructor to not crash.
1537     RegexPattern        patb;
1538
1539     REGEX_ASSERT(pata == patb);
1540     REGEX_ASSERT(pata == pata);
1541
1542     UnicodeString re1("abc[a-l][m-z]");
1543     UnicodeString re2("def");
1544     UErrorCode    status = U_ZERO_ERROR;
1545     UParseError   pe;
1546
1547     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1548     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1549     REGEX_CHECK_STATUS;
1550     REGEX_ASSERT(*pat1 == *pat1);
1551     REGEX_ASSERT(*pat1 != pata);
1552
1553     // Assign
1554     patb = *pat1;
1555     REGEX_ASSERT(patb == *pat1);
1556
1557     // Copy Construct
1558     RegexPattern patc(*pat1);
1559     REGEX_ASSERT(patc == *pat1);
1560     REGEX_ASSERT(patb == patc);
1561     REGEX_ASSERT(pat1 != pat2);
1562     patb = *pat2;
1563     REGEX_ASSERT(patb != patc);
1564     REGEX_ASSERT(patb == *pat2);
1565
1566     // Compile with no flags.
1567     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1568     REGEX_ASSERT(*pat1a == *pat1);
1569
1570     REGEX_ASSERT(pat1a->flags() == 0);
1571
1572     // Compile with different flags should be not equal
1573     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1574     REGEX_CHECK_STATUS;
1575
1576     REGEX_ASSERT(*pat1b != *pat1a);
1577     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1578     REGEX_ASSERT(pat1a->flags() == 0);
1579     delete pat1b;
1580
1581     // clone
1582     RegexPattern *pat1c = pat1->clone();
1583     REGEX_ASSERT(*pat1c == *pat1);
1584     REGEX_ASSERT(*pat1c != *pat2);
1585
1586     delete pat1c;
1587     delete pat1a;
1588     delete pat1;
1589     delete pat2;
1590
1591
1592     //
1593     //   Verify that a matcher created from a cloned pattern works.
1594     //     (Jitterbug 3423)
1595     //
1596     {
1597         UErrorCode     status     = U_ZERO_ERROR;
1598         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1599         RegexPattern  *pClone     = pSource->clone();
1600         delete         pSource;
1601         RegexMatcher  *mFromClone = pClone->matcher(status);
1602         REGEX_CHECK_STATUS;
1603         UnicodeString s = "Hello World";
1604         mFromClone->reset(s);
1605         REGEX_ASSERT(mFromClone->find() == TRUE);
1606         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1607         REGEX_ASSERT(mFromClone->find() == TRUE);
1608         REGEX_ASSERT(mFromClone->group(status) == "World");
1609         REGEX_ASSERT(mFromClone->find() == FALSE);
1610         delete mFromClone;
1611         delete pClone;
1612     }
1613
1614     //
1615     //   matches convenience API
1616     //
1617     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1618     REGEX_CHECK_STATUS;
1619     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1620     REGEX_CHECK_STATUS;
1621     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1622     REGEX_CHECK_STATUS;
1623     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1624     REGEX_CHECK_STATUS;
1625     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1626     REGEX_CHECK_STATUS;
1627     status = U_INDEX_OUTOFBOUNDS_ERROR;
1628     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1629     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1630
1631
1632     //
1633     // Split()
1634     //
1635     status = U_ZERO_ERROR;
1636     pat1 = RegexPattern::compile(" +",  pe, status);
1637     REGEX_CHECK_STATUS;
1638     UnicodeString  fields[10];
1639
1640     int32_t n;
1641     n = pat1->split("Now is the time", fields, 10, status);
1642     REGEX_CHECK_STATUS;
1643     REGEX_ASSERT(n==4);
1644     REGEX_ASSERT(fields[0]=="Now");
1645     REGEX_ASSERT(fields[1]=="is");
1646     REGEX_ASSERT(fields[2]=="the");
1647     REGEX_ASSERT(fields[3]=="time");
1648     REGEX_ASSERT(fields[4]=="");
1649
1650     n = pat1->split("Now is the time", fields, 2, status);
1651     REGEX_CHECK_STATUS;
1652     REGEX_ASSERT(n==2);
1653     REGEX_ASSERT(fields[0]=="Now");
1654     REGEX_ASSERT(fields[1]=="is the time");
1655     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1656
1657     fields[1] = "*";
1658     status = U_ZERO_ERROR;
1659     n = pat1->split("Now is the time", fields, 1, status);
1660     REGEX_CHECK_STATUS;
1661     REGEX_ASSERT(n==1);
1662     REGEX_ASSERT(fields[0]=="Now is the time");
1663     REGEX_ASSERT(fields[1]=="*");
1664     status = U_ZERO_ERROR;
1665
1666     n = pat1->split("    Now       is the time   ", fields, 10, status);
1667     REGEX_CHECK_STATUS;
1668     REGEX_ASSERT(n==6);
1669     REGEX_ASSERT(fields[0]=="");
1670     REGEX_ASSERT(fields[1]=="Now");
1671     REGEX_ASSERT(fields[2]=="is");
1672     REGEX_ASSERT(fields[3]=="the");
1673     REGEX_ASSERT(fields[4]=="time");
1674     REGEX_ASSERT(fields[5]=="");
1675
1676     n = pat1->split("     ", fields, 10, status);
1677     REGEX_CHECK_STATUS;
1678     REGEX_ASSERT(n==2);
1679     REGEX_ASSERT(fields[0]=="");
1680     REGEX_ASSERT(fields[1]=="");
1681
1682     fields[0] = "foo";
1683     n = pat1->split("", fields, 10, status);
1684     REGEX_CHECK_STATUS;
1685     REGEX_ASSERT(n==0);
1686     REGEX_ASSERT(fields[0]=="foo");
1687
1688     delete pat1;
1689
1690     //  split, with a pattern with (capture)
1691     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1692     REGEX_CHECK_STATUS;
1693
1694     status = U_ZERO_ERROR;
1695     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1696     REGEX_CHECK_STATUS;
1697     REGEX_ASSERT(n==7);
1698     REGEX_ASSERT(fields[0]=="");
1699     REGEX_ASSERT(fields[1]=="a");
1700     REGEX_ASSERT(fields[2]=="Now is ");
1701     REGEX_ASSERT(fields[3]=="b");
1702     REGEX_ASSERT(fields[4]=="the time");
1703     REGEX_ASSERT(fields[5]=="c");
1704     REGEX_ASSERT(fields[6]=="");
1705     REGEX_ASSERT(status==U_ZERO_ERROR);
1706
1707     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1708     REGEX_CHECK_STATUS;
1709     REGEX_ASSERT(n==7);
1710     REGEX_ASSERT(fields[0]=="  ");
1711     REGEX_ASSERT(fields[1]=="a");
1712     REGEX_ASSERT(fields[2]=="Now is ");
1713     REGEX_ASSERT(fields[3]=="b");
1714     REGEX_ASSERT(fields[4]=="the time");
1715     REGEX_ASSERT(fields[5]=="c");
1716     REGEX_ASSERT(fields[6]=="");
1717
1718     status = U_ZERO_ERROR;
1719     fields[6] = "foo";
1720     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1721     REGEX_CHECK_STATUS;
1722     REGEX_ASSERT(n==6);
1723     REGEX_ASSERT(fields[0]=="  ");
1724     REGEX_ASSERT(fields[1]=="a");
1725     REGEX_ASSERT(fields[2]=="Now is ");
1726     REGEX_ASSERT(fields[3]=="b");
1727     REGEX_ASSERT(fields[4]=="the time");
1728     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1729     REGEX_ASSERT(fields[6]=="foo");
1730
1731     status = U_ZERO_ERROR;
1732     fields[5] = "foo";
1733     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1734     REGEX_CHECK_STATUS;
1735     REGEX_ASSERT(n==5);
1736     REGEX_ASSERT(fields[0]=="  ");
1737     REGEX_ASSERT(fields[1]=="a");
1738     REGEX_ASSERT(fields[2]=="Now is ");
1739     REGEX_ASSERT(fields[3]=="b");
1740     REGEX_ASSERT(fields[4]=="the time<c>");
1741     REGEX_ASSERT(fields[5]=="foo");
1742
1743     status = U_ZERO_ERROR;
1744     fields[5] = "foo";
1745     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1746     REGEX_CHECK_STATUS;
1747     REGEX_ASSERT(n==5);
1748     REGEX_ASSERT(fields[0]=="  ");
1749     REGEX_ASSERT(fields[1]=="a");
1750     REGEX_ASSERT(fields[2]=="Now is ");
1751     REGEX_ASSERT(fields[3]=="b");
1752     REGEX_ASSERT(fields[4]=="the time");
1753     REGEX_ASSERT(fields[5]=="foo");
1754
1755     status = U_ZERO_ERROR;
1756     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1757     REGEX_CHECK_STATUS;
1758     REGEX_ASSERT(n==4);
1759     REGEX_ASSERT(fields[0]=="  ");
1760     REGEX_ASSERT(fields[1]=="a");
1761     REGEX_ASSERT(fields[2]=="Now is ");
1762     REGEX_ASSERT(fields[3]=="the time<c>");
1763     status = U_ZERO_ERROR;
1764     delete pat1;
1765
1766     pat1 = RegexPattern::compile("([-,])",  pe, status);
1767     REGEX_CHECK_STATUS;
1768     n = pat1->split("1-10,20", fields, 10, status);
1769     REGEX_CHECK_STATUS;
1770     REGEX_ASSERT(n==5);
1771     REGEX_ASSERT(fields[0]=="1");
1772     REGEX_ASSERT(fields[1]=="-");
1773     REGEX_ASSERT(fields[2]=="10");
1774     REGEX_ASSERT(fields[3]==",");
1775     REGEX_ASSERT(fields[4]=="20");
1776     delete pat1;
1777
1778     // Test split of string with empty trailing fields
1779     pat1 = RegexPattern::compile(",", pe, status);
1780     REGEX_CHECK_STATUS;
1781     n = pat1->split("a,b,c,", fields, 10, status);
1782     REGEX_CHECK_STATUS;
1783     REGEX_ASSERT(n==4);
1784     REGEX_ASSERT(fields[0]=="a");
1785     REGEX_ASSERT(fields[1]=="b");
1786     REGEX_ASSERT(fields[2]=="c");
1787     REGEX_ASSERT(fields[3]=="");
1788
1789     n = pat1->split("a,,,", fields, 10, status);
1790     REGEX_CHECK_STATUS;
1791     REGEX_ASSERT(n==4);
1792     REGEX_ASSERT(fields[0]=="a");
1793     REGEX_ASSERT(fields[1]=="");
1794     REGEX_ASSERT(fields[2]=="");
1795     REGEX_ASSERT(fields[3]=="");
1796     delete pat1;
1797
1798     // Split Separator with zero length match.
1799     pat1 = RegexPattern::compile(":?", pe, status);
1800     REGEX_CHECK_STATUS;
1801     n = pat1->split("abc", fields, 10, status);
1802     REGEX_CHECK_STATUS;
1803     REGEX_ASSERT(n==5);
1804     REGEX_ASSERT(fields[0]=="");
1805     REGEX_ASSERT(fields[1]=="a");
1806     REGEX_ASSERT(fields[2]=="b");
1807     REGEX_ASSERT(fields[3]=="c");
1808     REGEX_ASSERT(fields[4]=="");
1809
1810     delete pat1;
1811
1812     //
1813     // RegexPattern::pattern()
1814     //
1815     pat1 = new RegexPattern();
1816     REGEX_ASSERT(pat1->pattern() == "");
1817     delete pat1;
1818
1819     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1820     REGEX_CHECK_STATUS;
1821     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1822     delete pat1;
1823
1824
1825     //
1826     // classID functions
1827     //
1828     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1829     REGEX_CHECK_STATUS;
1830     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1831     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1832     UnicodeString Hello("Hello, world.");
1833     RegexMatcher *m = pat1->matcher(Hello, status);
1834     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1835     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1836     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1837     delete m;
1838     delete pat1;
1839
1840 }
1841
1842 //---------------------------------------------------------------------------
1843 //
1844 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1845 //                       is present and working, but excluding functions
1846 //                       implementing replace operations.
1847 //
1848 //---------------------------------------------------------------------------
1849 void RegexTest::API_Match_UTF8() {
         // Exercises the RegexPattern / RegexMatcher entry points that take UText
         // arguments, feeding them UTF-8 backed text.  Sections, in order:
         //   - pattern compilation, matcher reset, matches()/lookingAt() at positions
         //   - capture groups: start(), end(), group()
         //   - find(): plain, with \G in the pattern, and with zero-length matches
         //   - matchers that were never given an input
         //   - regions and the anchoring / transparent bounds flags
         //   - hitEnd() and requireEnd()
         //
         // Most sections run inside their own brace scope and declare their own
         // status / pe / flags, shadowing the three declared just below; only the
         // first section uses the function-level ones.
         // NOTE(review): REGEX_CHECK_STATUS takes no argument, so it presumably
         //   inspects the local variable named `status`; the REGEX_* macros are
         //   defined outside this function - confirm their behavior there.
1850     UParseError         pe;
1851     UErrorCode          status=U_ZERO_ERROR;
1852     int32_t             flags = 0;
1853
1854     //
1855     // Debug aid, compiled out by the "#if 0" below: move a failing test case
1856     //   into the empty braces to run it first; the return then skips the rest.
1857 #if 0
1858     {
1859     }
1860     return;
1861 #endif
1862
1863     //
1864     // Simple pattern compilation
1865     //
1866     {
             // Pattern "abc", opened as UTF-8 text and compiled through the UText overload.
1867         UText               re = UTEXT_INITIALIZER;
1868         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1869         REGEX_VERBOSE_TEXT(&re);
1870         RegexPattern        *pat2;
1871         pat2 = RegexPattern::compile(&re, flags, pe, status);
1872         REGEX_CHECK_STATUS;
1873
             // Three inputs: one that starts with the pattern text, one that ends
             // with it, and an empty text (zero UChars).
1874         UText input1 = UTEXT_INITIALIZER;
1875         UText input2 = UTEXT_INITIALIZER;
1876         UText empty  = UTEXT_INITIALIZER;
1877         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1878         REGEX_VERBOSE_TEXT(&input1);
1879         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1880         REGEX_VERBOSE_TEXT(&input2);
1881         utext_openUChars(&empty, NULL, 0, &status);
1882
             // Lengths of the literals; used below as the end-of-input index.
1883         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1884         int32_t input2Len = strlen("not abc");
1885
1886
1887         //
1888         // Matcher creation and reset.
1889         //
             // Expected texts are spelled out as explicit byte values rather than as
             // string literals (presumably because of the ASCII/EBCDIC caution at
             // the top of this file).
1890         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1891         REGEX_CHECK_STATUS;
1892         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1893         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1894         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1895         m1->reset(&input2);
1896         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1897         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1898         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1899         m1->reset(&input1);
1900         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1901         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1902         m1->reset(&empty);
1903         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1904         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1905
1906         //
1907         //  reset(pos, status)
1908         //
             // The input text is unchanged by reset(pos), and lookingAt() is still
             // expected to succeed even though "abc" does not occur at index 4.
             // NOTE(review): this implies lookingAt() does not start at the reset
             //   position - confirm against the RegexMatcher documentation.
1909         m1->reset(&input1);
1910         m1->reset(4, status);
1911         REGEX_CHECK_STATUS;
1912         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1913         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1914
             // Boundary positions: -1 and length+1 must report
             // U_INDEX_OUTOFBOUNDS_ERROR; 0, length-1 and length must be accepted.
1915         m1->reset(-1, status);
1916         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1917         status = U_ZERO_ERROR;
1918
1919         m1->reset(0, status);
1920         REGEX_CHECK_STATUS;
1921         status = U_ZERO_ERROR;
1922
1923         m1->reset(input1Len-1, status);
1924         REGEX_CHECK_STATUS;
1925         status = U_ZERO_ERROR;
1926
1927         m1->reset(input1Len, status);
1928         REGEX_CHECK_STATUS;
1929         status = U_ZERO_ERROR;
1930
1931         m1->reset(input1Len+1, status);
1932         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1933         status = U_ZERO_ERROR;
1934
1935         //
1936         // match(pos, status)
1937         //
             // input2 is "not abc": the pattern text begins at index 4 and runs to
             // the end of the input, so 4 is the only start index expected to match.
1938         m1->reset(&input2);
1939         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1940         m1->reset();
1941         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1942         m1->reset();
1943         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1944         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1945         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1946         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1947
1948         // Match() at end of string should fail, but should not
1949         //  be an error.
1950         status = U_ZERO_ERROR;
1951         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1952         REGEX_CHECK_STATUS;
1953
1954         // Match beyond end of string should fail with an error.
1955         status = U_ZERO_ERROR;
1956         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1957         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958
1959         // Successful match at end of string.
1960         {
1961             status = U_ZERO_ERROR;
1962             RegexMatcher m("A?", 0, status);  // will match zero length string.
1963             REGEX_CHECK_STATUS;
1964             m.reset(&input1);
1965             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1966             REGEX_CHECK_STATUS;
1967             m.reset(&empty);
1968             REGEX_ASSERT(m.matches(0, status) == TRUE);
1969             REGEX_CHECK_STATUS;
1970         }
1971
1972
1973         //
1974         // lookingAt(pos, status)
1975         //
             // Same position checks as for matches(pos) above, on the same input.
1976         status = U_ZERO_ERROR;
1977         m1->reset(&input2);  // "not abc"
1978         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1980         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1981         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1982         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1983         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1984         status = U_ZERO_ERROR;
1985         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1986         REGEX_CHECK_STATUS;
1987         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1988         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1989
             // Release the matcher and pattern, then close every UText opened above.
1990         delete m1;
1991         delete pat2;
1992
1993         utext_close(&re);
1994         utext_close(&input1);
1995         utext_close(&input2);
1996         utext_close(&empty);
1997     }
1998
1999
2000     //
2001     // Capture Group.
2002     //     RegexMatcher::start();
2003     //     RegexMatcher::end();
2004     //     RegexMatcher::groupCount();
2005     //
2006     {
2007         int32_t             flags=0;
2008         UParseError         pe;
2009         UErrorCode          status=U_ZERO_ERROR;
2010         UText               re=UTEXT_INITIALIZER;
2011         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2012         utext_openUTF8(&re, str_01234567_pat, -1, &status);
2013
2014         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2015         REGEX_CHECK_STATUS;
2016
2017         UText input = UTEXT_INITIALIZER;
2018         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019         utext_openUTF8(&input, str_0123456789, -1, &status);
2020
2021         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2022         REGEX_CHECK_STATUS;
2023         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
             // Expected start / end index of groups 0..3 for the pattern above
             // applied to "0123456789" (group 0 is the whole match).
2024         static const int32_t matchStarts[] = {0,  2, 4, 8};
2025         static const int32_t matchEnds[]   = {10, 8, 6, 10};
2026         int32_t i;
2027         for (i=0; i<4; i++) {
2028             int32_t actualStart = matcher->start(i, status);
2029             REGEX_CHECK_STATUS;
2030             if (actualStart != matchStarts[i]) {
2031                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2032                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2033             }
2034             int32_t actualEnd = matcher->end(i, status);
2035             REGEX_CHECK_STATUS;
2036             if (actualEnd != matchEnds[i]) {
2037                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2038                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2039             }
2040         }
2041
             // The no-group overloads must agree with group 0.
2042         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2043         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2044
             // Group numbers outside 0..3 are index errors; asking for a start
             // after reset(), before any new match, is an invalid-state error.
2045         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2046         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2047         matcher->reset();
2048         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2049
             // Re-establish a match so the group() calls below have one to read.
2050         matcher->lookingAt(status);
2051
             // destText wraps the UnicodeString dest and is passed as the
             // caller-supplied destination in the group(..., &destText, ...) calls.
2052         UnicodeString dest;
2053         UText destText = UTEXT_INITIALIZER;
2054         utext_openUnicodeString(&destText, &dest, &status);
2055         UText *result;
2056         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2057         //  Test shallow-clone API
             //  With a NULL destination the returned UText is closed here by the
             //  test; with &destText the same pointer is expected back.
2058         int64_t   group_len;
2059         result = matcher->group((UText *)NULL, group_len, status);
2060         REGEX_CHECK_STATUS;
2061         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2062         utext_close(result);
2063         result = matcher->group(0, &destText, group_len, status);
2064         REGEX_CHECK_STATUS;
2065         REGEX_ASSERT(result == &destText);
2066         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2067         //  destText is now immutable, reopen it
2068         utext_close(&destText);
2069         utext_openUnicodeString(&destText, &dest, &status);
2070
             // Group 0: the returned text is the whole input, positioned at index 0,
             // and the reported length is 10.
2071         int64_t length;
2072         result = matcher->group(0, NULL, length, status);
2073         REGEX_CHECK_STATUS;
2074         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2075         utext_close(result);
2076         result = matcher->group(0, &destText, length, status);
2077         REGEX_CHECK_STATUS;
2078         REGEX_ASSERT(result == &destText);
2079         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2080         REGEX_ASSERT(length == 10);
2081         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2082
             // For groups 1..3 the assertions expect the returned text to still be
             // the whole input "0123456789"; the group itself is identified by the
             // text's native index (group start) and by the reported length.
2083         // Capture Group 1 == "234567"
2084         result = matcher->group(1, NULL, length, status);
2085         REGEX_CHECK_STATUS;
2086         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2087         REGEX_ASSERT(length == 6);
2088         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2089         utext_close(result);
2090
2091         result = matcher->group(1, &destText, length, status);
2092         REGEX_CHECK_STATUS;
2093         REGEX_ASSERT(result == &destText);
2094         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2095         REGEX_ASSERT(length == 6);
2096         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2097         utext_close(result);   // result == &destText here, so this closes destText
2098
2099         // Capture Group 2 == "45"
2100         result = matcher->group(2, NULL, length, status);
2101         REGEX_CHECK_STATUS;
2102         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2103         REGEX_ASSERT(length == 2);
2104         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2105         utext_close(result);
2106
2107         result = matcher->group(2, &destText, length, status);
2108         REGEX_CHECK_STATUS;
2109         REGEX_ASSERT(result == &destText);
2110         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2111         REGEX_ASSERT(length == 2);
2112         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2113         utext_close(result);
2114
2115         // Capture Group 3 == "89"
2116         result = matcher->group(3, NULL, length, status);
2117         REGEX_CHECK_STATUS;
2118         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2119         REGEX_ASSERT(length == 2);
2120         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2121         utext_close(result);
2122
2123         result = matcher->group(3, &destText, length, status);
2124         REGEX_CHECK_STATUS;
2125         REGEX_ASSERT(result == &destText);
2126         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2127         REGEX_ASSERT(length == 2);
2128         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2129         utext_close(result);
2130
2131         // Capture Group number out of range.
2132         status = U_ZERO_ERROR;
2133         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134         status = U_ZERO_ERROR;
2135         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136         status = U_ZERO_ERROR;
2137         matcher->reset();
2138         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2139
2140         delete matcher;
2141         delete pat;
2142
2143         utext_close(&destText);
2144         utext_close(&input);
2145         utext_close(&re);
2146     }
2147
2148     //
2149     //  find
2150     //
2151     {
2152         int32_t             flags=0;
2153         UParseError         pe;
2154         UErrorCode          status=U_ZERO_ERROR;
2155         UText               re=UTEXT_INITIALIZER;
2156         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2157         utext_openUTF8(&re, str_abc, -1, &status);
2158
2159         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2160         REGEX_CHECK_STATUS;
2161         UText input = UTEXT_INITIALIZER;
2162         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2163         utext_openUTF8(&input, str_abcabcabc, -1, &status);
             //   text:          .abc..abc...abc..
2164         //   native index:  012345678901234567   (17 is the end of the input)
2165
             // Successive find() calls walk the three occurrences at 1, 6 and 12,
             // and then keep returning FALSE.
2166         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2167         REGEX_CHECK_STATUS;
2168         REGEX_ASSERT(matcher->find());
2169         REGEX_ASSERT(matcher->start(status) == 1);
2170         REGEX_ASSERT(matcher->find());
2171         REGEX_ASSERT(matcher->start(status) == 6);
2172         REGEX_ASSERT(matcher->find());
2173         REGEX_ASSERT(matcher->start(status) == 12);
2174         REGEX_ASSERT(matcher->find() == FALSE);
2175         REGEX_ASSERT(matcher->find() == FALSE);
2176
2177         matcher->reset();
2178         REGEX_ASSERT(matcher->find());
2179         REGEX_ASSERT(matcher->start(status) == 1);
2180
             // find(pos) searches forward from pos: the first occurrence at or
             // after pos is reported; from 13 onward there is none left.
2181         REGEX_ASSERT(matcher->find(0, status));
2182         REGEX_ASSERT(matcher->start(status) == 1);
2183         REGEX_ASSERT(matcher->find(1, status));
2184         REGEX_ASSERT(matcher->start(status) == 1);
2185         REGEX_ASSERT(matcher->find(2, status));
2186         REGEX_ASSERT(matcher->start(status) == 6);
2187         REGEX_ASSERT(matcher->find(12, status));
2188         REGEX_ASSERT(matcher->start(status) == 12);
2189         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2190         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2191         REGEX_ASSERT(matcher->find(17, status) == FALSE);
             // After a failed find there is no current match to report on.
2192         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2193
             // Start positions outside 0..17 are index errors.
2194         status = U_ZERO_ERROR;
2195         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2196         status = U_ZERO_ERROR;
2197         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198
             // The pattern "abc" contains no capture groups.
2199         REGEX_ASSERT(matcher->groupCount() == 0);
2200
2201         delete matcher;
2202         delete pat;
2203
2204         utext_close(&input);
2205         utext_close(&re);
2206     }
2207
2208
2209     //
2210     //  find, with \G in pattern (true if at the end of a previous match).
2211     //
2212     {
2213         int32_t             flags=0;
2214         UParseError         pe;
2215         UErrorCode          status=U_ZERO_ERROR;
2216         UText               re=UTEXT_INITIALIZER;
2217         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\Gabc)|(abc)) */
2218         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2219
2220         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2221
2222         REGEX_CHECK_STATUS;
2223         UText input = UTEXT_INITIALIZER;
2224         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2225         utext_openUTF8(&input, str_abcabcabc, -1, &status);
             //   text:          .abcabc.abc..
2226         //   native index:  0123456789012
2227
             // First find: the match starts at 0 and is completed by the plain
             // (abc) alternative, group 2, at index 1; group 1 did not participate
             // and so reports -1.
2228         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2229         REGEX_CHECK_STATUS;
2230         REGEX_ASSERT(matcher->find());
2231         REGEX_ASSERT(matcher->start(status) == 0);
2232         REGEX_ASSERT(matcher->start(1, status) == -1);
2233         REGEX_ASSERT(matcher->start(2, status) == 1);
2234
             // Second find: it begins at index 4, exactly where the first match
             // ended, so this time the \G alternative, group 1, is the one that matches.
2235         REGEX_ASSERT(matcher->find());
2236         REGEX_ASSERT(matcher->start(status) == 4);
2237         REGEX_ASSERT(matcher->start(1, status) == 4);
2238         REGEX_ASSERT(matcher->start(2, status) == -1);
2239         REGEX_CHECK_STATUS;
2240
2241         delete matcher;
2242         delete pat;
2243
2244         utext_close(&input);
2245         utext_close(&re);
2246     }
2247
2248     //
2249     //   find with zero length matches, match position should bump ahead
2250     //     to prevent loops.
2251     //
2252     {
2253         int32_t                 i;
2254         UErrorCode          status=U_ZERO_ERROR;
2255         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will produce zero-length matches anywhere,
2256                                                       //   using an always-true look-ahead.
2257         REGEX_CHECK_STATUS;
2258         UText s = UTEXT_INITIALIZER;
2259         utext_openUTF8(&s, "    ", -1, &status);
2260         m.reset(&s);
             // Four spaces: an empty match is expected at each of the indexes
             // 0..4, so the loop exits with i == 5.
2261         for (i=0; ; i++) {
2262             if (m.find() == FALSE) {
2263                 break;
2264             }
2265             REGEX_ASSERT(m.start(status) == i);
2266             REGEX_ASSERT(m.end(status) == i);
2267         }
2268         REGEX_ASSERT(i==5);
2269
2270         // Check that the bump goes over characters outside the BMP OK
2271         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
             // Each of these characters is written as 4 bytes below, so the expected
             // match index advances by 4 per iteration: 0, 4, 8, 12, 16, ending with i == 20.
2272         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2273         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);   // re-opens s on the new text
2274         m.reset(&s);
2275         for (i=0; ; i+=4) {
2276             if (m.find() == FALSE) {
2277                 break;
2278             }
2279             REGEX_ASSERT(m.start(status) == i);
2280             REGEX_ASSERT(m.end(status) == i);
2281         }
2282         REGEX_ASSERT(i==20);
2283
2284         utext_close(&s);
2285     }
2286     {
2287         // find() loop breaking test.
2288         //        with pattern of /.?/, should see a series of one char matches, then a single
2289         //        match of zero length at the end of the input string.
2290         int32_t                 i;
2291         UErrorCode          status=U_ZERO_ERROR;
2292         RegexMatcher        m(".?", 0, status);
2293         REGEX_CHECK_STATUS;
2294         UText s = UTEXT_INITIALIZER;
2295         utext_openUTF8(&s, "    ", -1, &status);
2296         m.reset(&s);
2297         for (i=0; ; i++) {
2298             if (m.find() == FALSE) {
2299                 break;
2300             }
2301             REGEX_ASSERT(m.start(status) == i);
                 // Indexes 0..3 each match one character; index 4 is the empty
                 // match at the end of the input.
2302             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2303         }
2304         REGEX_ASSERT(i==5);
2305
2306         utext_close(&s);
2307     }
2308
2309
2310     //
2311     // Matchers with no input string behave as if they had an empty input string.
2312     //
2313
2314     {
             // Matcher built directly from a pattern string, never given an input:
             // the optional pattern ".?" still finds an empty match at index 0.
2315         UErrorCode status = U_ZERO_ERROR;
2316         RegexMatcher  m(".?", 0, status);
2317         REGEX_CHECK_STATUS;
2318         REGEX_ASSERT(m.find());
2319         REGEX_ASSERT(m.start(status) == 0);
2320         REGEX_ASSERT(m.input() == "");
2321     }
2322     {
             // Matcher obtained from a compiled pattern, never given an input:
             // "." needs one character, so find() fails, and inputText() has length 0.
2323         UErrorCode status = U_ZERO_ERROR;
2324         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2325         RegexMatcher  *m = p->matcher(status);
2326         REGEX_CHECK_STATUS;
2327
2328         REGEX_ASSERT(m->find() == FALSE);
2329         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2330         delete m;
2331         delete p;
2332     }
2333
2334     //
2335     // Regions
2336     //
2337     {
2338         UErrorCode status = U_ZERO_ERROR;
2339         UText testPattern = UTEXT_INITIALIZER;
2340         UText testText    = UTEXT_INITIALIZER;
2341         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2342         REGEX_VERBOSE_TEXT(&testPattern);
2343         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2344         REGEX_VERBOSE_TEXT(&testText);
2345
             // A new matcher: region covers the whole input, bounds are
             // non-transparent and anchoring.
2346         RegexMatcher m(&testPattern, &testText, 0, status);
2347         REGEX_CHECK_STATUS;
2348         REGEX_ASSERT(m.regionStart() == 0);
2349         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2350         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2351         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352
             // With the region set to 2..4, matches() with ".*" is expected to
             // cover exactly that span.
2353         m.region(2,4, status);
2354         REGEX_CHECK_STATUS;
2355         REGEX_ASSERT(m.matches(status));
2356         REGEX_ASSERT(m.start(status)==2);
2357         REGEX_ASSERT(m.end(status)==4);
2358         REGEX_CHECK_STATUS;
2359
             // reset() puts the region back to the whole input.
2360         m.reset();
2361         REGEX_ASSERT(m.regionStart() == 0);
2362         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2363
             // reset(newInput) sets the region to the whole of the new input.
2364         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2365         REGEX_VERBOSE_TEXT(&testText);
2366         m.reset(&testText);
2367         REGEX_ASSERT(m.regionStart() == 0);
2368         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2369
             // The use*Bounds() setters and reset() return the matcher itself, and
             // reset() leaves the anchoring / transparent settings as they were.
2370         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2371         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2372         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2373         REGEX_ASSERT(&m == &m.reset());
2374         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375
2376         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2377         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2378         REGEX_ASSERT(&m == &m.reset());
2379         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380
2381         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2382         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2383         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2384         REGEX_ASSERT(&m == &m.reset());
2385         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386
2387         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2388         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2389         REGEX_ASSERT(&m == &m.reset());
2390         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391
2392         utext_close(&testText);
2393         utext_close(&testPattern);
2394     }
2395
2396     //
2397     // hitEnd() and requireEnd()
2398     //
2399     {
             // Three patterns are tried against the same input "aabb".  testPattern
             // is re-opened on a new pattern string before each of m2 and m3 is built.
2400         UErrorCode status = U_ZERO_ERROR;
2401         UText testPattern = UTEXT_INITIALIZER;
2402         UText testText    = UTEXT_INITIALIZER;
2403         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2404         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2405         utext_openUTF8(&testPattern, str_, -1, &status);
2406         utext_openUTF8(&testText, str_aabb, -1, &status);
2407
             // ".*" : expected to hit the end of input, without requiring it.
2408         RegexMatcher m1(&testPattern, &testText,  0, status);
2409         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2410         REGEX_ASSERT(m1.hitEnd() == TRUE);
2411         REGEX_ASSERT(m1.requireEnd() == FALSE);
2412         REGEX_CHECK_STATUS;
2413
             // "a*" : expected to neither hit nor require the end of input.
2414         status = U_ZERO_ERROR;
2415         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2416         utext_openUTF8(&testPattern, str_a, -1, &status);
2417         RegexMatcher m2(&testPattern, &testText, 0, status);
2418         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2419         REGEX_ASSERT(m2.hitEnd() == FALSE);
2420         REGEX_ASSERT(m2.requireEnd() == FALSE);
2421         REGEX_CHECK_STATUS;
2422
             // ".*$" : expected to both hit and require the end of input.
2423         status = U_ZERO_ERROR;
2424         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2425         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2426         RegexMatcher m3(&testPattern, &testText, 0, status);
2427         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2428         REGEX_ASSERT(m3.hitEnd() == TRUE);
2429         REGEX_ASSERT(m3.requireEnd() == TRUE);
2430         REGEX_CHECK_STATUS;
2431
2432         utext_close(&testText);
2433         utext_close(&testPattern);
2434     }
2435 }
2436
2437
2438 //---------------------------------------------------------------------------
2439 //
2440 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2441 //                         Replace family of functions.
2442 //
2443 //---------------------------------------------------------------------------
2444 void RegexTest::API_Replace_UTF8() {
2445     //
2446     //  Replace: exercises RegexMatcher::replaceFirst() and replaceAll() using UText
         //  input, replacement and destination arguments.
         //
         //  Most cases are run in two forms: with a NULL destination (the returned UText
         //  is closed by the test with utext_close()) and with the caller-supplied
         //  destText (the test asserts that the returned pointer is &destText).
         //  The final section (Bug 4057) covers appendReplacement() and appendTail().
         //
         //  Test strings are spelled as explicit byte values, with the text they stand
         //  for in the trailing comment. Presumably this keeps the bytes fixed whatever
         //  the compiler's character set is (see the ASCII/EBCDIC note at the top of
         //  this file) -- TODO confirm.
2447     //
2448     int32_t             flags=0;
2449     UParseError         pe;
2450     UErrorCode          status=U_ZERO_ERROR;
2451
         // Pattern "abc", opened from an invariant-character literal.
2452     UText               re=UTEXT_INITIALIZER;
2453     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2454     REGEX_VERBOSE_TEXT(&re);
2455     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2456     REGEX_CHECK_STATUS;
2457
2458     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2459     //             012345678901234567
2460     UText dataText = UTEXT_INITIALIZER;
2461     utext_openUTF8(&dataText, data, -1, &status);
2462     REGEX_CHECK_STATUS;
2463     REGEX_VERBOSE_TEXT(&dataText);
         // The matcher obtained from pat->matcher() is owned by this test and is
         // deleted at the end of the function.
2464     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2465
2466     //
2467     //  Plain vanilla matches.
2468     //
         // destText is a UText over the UnicodeString 'dest'; it is the caller-supplied
         // destination reused by every "&destText" call below.
2469     UnicodeString  dest;
2470     UText destText = UTEXT_INITIALIZER;
2471     utext_openUnicodeString(&destText, &dest, &status);
2472     UText *result;
2473
2474     UText replText = UTEXT_INITIALIZER;
2475
2476     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2477     utext_openUTF8(&replText, str_yz, -1, &status);
2478     REGEX_VERBOSE_TEXT(&replText);
2479     result = matcher->replaceFirst(&replText, NULL, status);
2480     REGEX_CHECK_STATUS;
2481     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2482     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2483     utext_close(result);
2484     result = matcher->replaceFirst(&replText, &destText, status);
2485     REGEX_CHECK_STATUS;
2486     REGEX_ASSERT(result == &destText);
2487     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2488
2489     result = matcher->replaceAll(&replText, NULL, status);
2490     REGEX_CHECK_STATUS;
2491     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2492     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2493     utext_close(result);
2494
         // Empty destText (replace its whole native range with zero-length text)
         // before it is reused as the destination.
2495     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496     result = matcher->replaceAll(&replText, &destText, status);
2497     REGEX_CHECK_STATUS;
2498     REGEX_ASSERT(result == &destText);
2499     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2500
2501     //
2502     //  Plain vanilla non-matches.
         //  The pattern "abc" does not occur in the input, so every result is
         //  expected to equal the input text.
2503     //
2504     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2505     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2506     matcher->reset(&dataText);
2507
2508     result = matcher->replaceFirst(&replText, NULL, status);
2509     REGEX_CHECK_STATUS;
2510     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2511     utext_close(result);
2512     result = matcher->replaceFirst(&replText, &destText, status);
2513     REGEX_CHECK_STATUS;
2514     REGEX_ASSERT(result == &destText);
2515     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2516
2517     result = matcher->replaceAll(&replText, NULL, status);
2518     REGEX_CHECK_STATUS;
2519     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2520     utext_close(result);
2521     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522     result = matcher->replaceAll(&replText, &destText, status);
2523     REGEX_CHECK_STATUS;
2524     REGEX_ASSERT(result == &destText);
2525     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2526
2527     //
2528     // Empty source string
         // (opened from a NULL pointer with length 0); every result is expected to be "".
2529     //
2530     utext_openUTF8(&dataText, NULL, 0, &status);
2531     matcher->reset(&dataText);
2532
2533     result = matcher->replaceFirst(&replText, NULL, status);
2534     REGEX_CHECK_STATUS;
2535     REGEX_ASSERT_UTEXT_UTF8("", result);
2536     utext_close(result);
2537     result = matcher->replaceFirst(&replText, &destText, status);
2538     REGEX_CHECK_STATUS;
2539     REGEX_ASSERT(result == &destText);
2540     REGEX_ASSERT_UTEXT_UTF8("", result);
2541
2542     result = matcher->replaceAll(&replText, NULL, status);
2543     REGEX_CHECK_STATUS;
2544     REGEX_ASSERT_UTEXT_UTF8("", result);
2545     utext_close(result);
2546     result = matcher->replaceAll(&replText, &destText, status);
2547     REGEX_CHECK_STATUS;
2548     REGEX_ASSERT(result == &destText);
2549     REGEX_ASSERT_UTEXT_UTF8("", result);
2550
2551     //
2552     // Empty substitution string
         // (replacement opened from a NULL pointer with length 0); matches of "abc"
         // are expected to be removed from the output.
2553     //
2554     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2555     matcher->reset(&dataText);
2556
2557     utext_openUTF8(&replText, NULL, 0, &status);
2558     result = matcher->replaceFirst(&replText, NULL, status);
2559     REGEX_CHECK_STATUS;
2560     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2561     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2562     utext_close(result);
2563     result = matcher->replaceFirst(&replText, &destText, status);
2564     REGEX_CHECK_STATUS;
2565     REGEX_ASSERT(result == &destText);
2566     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2567
2568     result = matcher->replaceAll(&replText, NULL, status);
2569     REGEX_CHECK_STATUS;
2570     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2571     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2572     utext_close(result);
2573     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2574     result = matcher->replaceAll(&replText, &destText, status);
2575     REGEX_CHECK_STATUS;
2576     REGEX_ASSERT(result == &destText);
2577     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2578
2579     //
2580     // match whole string
         // Input "abc" is matched in full, so the result is just the replacement "xyz".
2581     //
2582     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2583     utext_openUTF8(&dataText, str_abc, -1, &status);
2584     matcher->reset(&dataText);
2585
2586     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2587     utext_openUTF8(&replText, str_xyz, -1, &status);
2588     result = matcher->replaceFirst(&replText, NULL, status);
2589     REGEX_CHECK_STATUS;
2590     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2591     utext_close(result);
2592     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593     result = matcher->replaceFirst(&replText, &destText, status);
2594     REGEX_CHECK_STATUS;
2595     REGEX_ASSERT(result == &destText);
2596     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2597
2598     result = matcher->replaceAll(&replText, NULL, status);
2599     REGEX_CHECK_STATUS;
2600     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2601     utext_close(result);
2602     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603     result = matcher->replaceAll(&replText, &destText, status);
2604     REGEX_CHECK_STATUS;
2605     REGEX_ASSERT(result == &destText);
2606     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2607
2608     //
2609     // Capture Group, simple case
         // A second pattern, a(..), with one capture group, and its own matcher
         // (matcher2) over "abcdefg"; both are deleted at the end of the function.
2610     //
2611     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2612     utext_openUTF8(&re, str_add, -1, &status);
2613     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2614     REGEX_CHECK_STATUS;
2615
2616     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2617     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2618     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2619     REGEX_CHECK_STATUS;
2620
         // "$1$1": the group-1 text ("bc") is substituted twice.
2621     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2622     utext_openUTF8(&replText, str_11, -1, &status);
2623     result = matcher2->replaceFirst(&replText, NULL, status);
2624     REGEX_CHECK_STATUS;
2625     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2626     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2627     utext_close(result);
2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629     result = matcher2->replaceFirst(&replText, &destText, status);
2630     REGEX_CHECK_STATUS;
2631     REGEX_ASSERT(result == &destText);
2632     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2633
         // A backslash-escaped "\$1" is expected to come out as a literal "$1",
         // while the unescaped "$1" is replaced by the group text.
2634     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2635     utext_openUTF8(&replText, str_v, -1, &status);
2636     REGEX_VERBOSE_TEXT(&replText);
2637     result = matcher2->replaceFirst(&replText, NULL, status);
2638     REGEX_CHECK_STATUS;
2639     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2640     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2641     utext_close(result);
2642     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643     result = matcher2->replaceFirst(&replText, &destText, status);
2644     REGEX_CHECK_STATUS;
2645     REGEX_ASSERT(result == &destText);
2646     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2647
         // Escaped "\$" with no group number following: each is expected to yield a literal "$".
2648     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2651     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2652     result = matcher2->replaceFirst(&replText, NULL, status);
2653     REGEX_CHECK_STATUS;
2654     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2655     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2656     utext_close(result);
2657     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2658     result = matcher2->replaceFirst(&replText, &destText, status);
2659     REGEX_CHECK_STATUS;
2660     REGEX_ASSERT(result == &destText);
2661     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2662
         // '$' followed by a supplementary-plane digit: the "xxxx" placeholder
         // (bytes 22..25) is overwritten below with F0 9D 9F 8F, the UTF-8 form of
         // U+1D7CF MATHEMATICAL BOLD DIGIT ONE. The expected output shows it being
         // handled as a reference to group 1.
2663     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2664     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2665     //                                 012345678901234567890123456
2666     supplDigitChars[22] = 0xF0;
2667     supplDigitChars[23] = 0x9D;
2668     supplDigitChars[24] = 0x9F;
2669     supplDigitChars[25] = 0x8F;
2670     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2671
2672     result = matcher2->replaceFirst(&replText, NULL, status);
2673     REGEX_CHECK_STATUS;
2674     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2675     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2676     utext_close(result);
2677     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678     result = matcher2->replaceFirst(&replText, &destText, status);
2679     REGEX_CHECK_STATUS;
2680     REGEX_ASSERT(result == &destText);
2681     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
         // The pattern a(..) has a single capture group, so a reference to $5 is
         // expected to fail with U_INDEX_OUTOFBOUNDS_ERROR in both call forms.
2682     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2683     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2684     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2685 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2686     utext_close(result);
2687     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2688     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2689     REGEX_ASSERT(result == &destText);
2690 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2691
2692     //
2693     // Replacement String with \u hex escapes
         // These two blocks go back to 'matcher' (pattern "abc").
2694     //
2695     {
           // A \u0043 escape in the replacement text is expected to produce 'C' (0x43).
2696       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2697       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2698         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2699         utext_openUTF8(&replText, str_u0043, -1, &status);
2700         matcher->reset(&dataText);
2701
2702         result = matcher->replaceAll(&replText, NULL, status);
2703         REGEX_CHECK_STATUS;
2704         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2705         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2706         utext_close(result);
2707         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2708         result = matcher->replaceAll(&replText, &destText, status);
2709         REGEX_CHECK_STATUS;
2710         REGEX_ASSERT(result == &destText);
2711         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2712     }
2713     {
           // Note: this str_abc ("abc !") shadows the function-level str_abc ("abc").
2714       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2715         utext_openUTF8(&dataText, str_abc, -1, &status);
2716         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2717         utext_openUTF8(&replText, str_U00010000, -1, &status);
2718         matcher->reset(&dataText);
2719
             // The "xxxx" placeholder (bytes 2..5) is overwritten below with
             // F0 90 80 80, the UTF-8 encoding of U+10000.
2720         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2721         //                          0123456789
2722         expected[2] = 0xF0;
2723         expected[3] = 0x90;
2724         expected[4] = 0x80;
2725         expected[5] = 0x80;
2726
2727         result = matcher->replaceAll(&replText, NULL, status);
2728         REGEX_CHECK_STATUS;
2729         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2730         utext_close(result);
2731         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2732         result = matcher->replaceAll(&replText, &destText, status);
2733         REGEX_CHECK_STATUS;
2734         REGEX_ASSERT(result == &destText);
2735         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2736     }
2737     // TODO:  need more thorough testing of capture substitutions.
2738
2739     // Bug 4057
         //   appendReplacement() after several find() calls: the output is expected
         //   to start from the beginning of the input, not from the previous match.
2740     //
2741     {
2742         status = U_ZERO_ERROR;
2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2746         utext_openUTF8(&re, str_ssee, -1, &status);
2747         utext_openUTF8(&dataText, str_blah, -1, &status);
2748         utext_openUTF8(&replText, str_ooh, -1, &status);
2749
2750         RegexMatcher m(&re, 0, status);
2751         REGEX_CHECK_STATUS;
2752
             // Note: this UnicodeString 'result' shadows the function-level UText *result.
             // resultText is a UText over it and receives the appended output.
2753         UnicodeString result;
2754         UText resultText = UTEXT_INITIALIZER;
2755         utext_openUnicodeString(&resultText, &result, &status);
2756
2757         // Multiple finds do NOT bump up the previous appendReplacement position.
2758         m.reset(&dataText);
2759         m.find();
2760         m.find();
2761         m.appendReplacement(&resultText, &replText, status);
2762         REGEX_CHECK_STATUS;
2763         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2764         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2765
2766         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2767         status = U_ZERO_ERROR;
2768         result.truncate(0);
2769         utext_openUnicodeString(&resultText, &result, &status);
2770         m.reset(10, status);
2771         m.find();
2772         m.find();
2773         m.appendReplacement(&resultText, &replText, status);
2774         REGEX_CHECK_STATUS;
2775         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2776         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2777
2778         // find() at interior of string, appendReplacement still starts at beginning.
2779         status = U_ZERO_ERROR;
2780         result.truncate(0);
2781         utext_openUnicodeString(&resultText, &result, &status);
2782         m.reset();
2783         m.find(10, status);
2784         m.find();
2785         m.appendReplacement(&resultText, &replText, status);
2786         REGEX_CHECK_STATUS;
2787         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2788         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2789
             // appendTail() adds the rest of the input after the last match (" fin").
             // Only the resulting text is asserted; status is not checked after this call.
2790         m.appendTail(&resultText, status);
2791         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2792         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2793
2794         utext_close(&resultText);
2795     }
2796
         // Clean up: delete the matchers and patterns created above, then close the
         // UTexts that were opened in this function.
2797     delete matcher2;
2798     delete pat2;
2799     delete matcher;
2800     delete pat;
2801
2802     utext_close(&dataText);
2803     utext_close(&replText);
2804     utext_close(&destText);
2805     utext_close(&re);
2806 }
2807
2808
2809 //---------------------------------------------------------------------------
2810 //
2811 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2812 //                        present and nominally working.
2813 //
2814 //---------------------------------------------------------------------------
2815 void RegexTest::API_Pattern_UTF8() {
2816     RegexPattern        pata;    // Test default constructor to not crash.
2817     RegexPattern        patb;
2818
2819     REGEX_ASSERT(pata == patb);
2820     REGEX_ASSERT(pata == pata);
2821
2822     UText         re1 = UTEXT_INITIALIZER;
2823     UText         re2 = UTEXT_INITIALIZER;
2824     UErrorCode    status = U_ZERO_ERROR;
2825     UParseError   pe;
2826
2827     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2828     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2829     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2830     utext_openUTF8(&re2, str_def, -1, &status);
2831
2832     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2833     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2834     REGEX_CHECK_STATUS;
2835     REGEX_ASSERT(*pat1 == *pat1);
2836     REGEX_ASSERT(*pat1 != pata);
2837
2838     // Assign
2839     patb = *pat1;
2840     REGEX_ASSERT(patb == *pat1);
2841
2842     // Copy Construct
2843     RegexPattern patc(*pat1);
2844     REGEX_ASSERT(patc == *pat1);
2845     REGEX_ASSERT(patb == patc);
2846     REGEX_ASSERT(pat1 != pat2);
2847     patb = *pat2;
2848     REGEX_ASSERT(patb != patc);
2849     REGEX_ASSERT(patb == *pat2);
2850
2851     // Compile with no flags.
2852     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2853     REGEX_ASSERT(*pat1a == *pat1);
2854
2855     REGEX_ASSERT(pat1a->flags() == 0);
2856
2857     // Compile with different flags should be not equal
2858     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2859     REGEX_CHECK_STATUS;
2860
2861     REGEX_ASSERT(*pat1b != *pat1a);
2862     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2863     REGEX_ASSERT(pat1a->flags() == 0);
2864     delete pat1b;
2865
2866     // clone
2867     RegexPattern *pat1c = pat1->clone();
2868     REGEX_ASSERT(*pat1c == *pat1);
2869     REGEX_ASSERT(*pat1c != *pat2);
2870
2871     delete pat1c;
2872     delete pat1a;
2873     delete pat1;
2874     delete pat2;
2875
2876     utext_close(&re1);
2877     utext_close(&re2);
2878
2879
2880     //
2881     //   Verify that a matcher created from a cloned pattern works.
2882     //     (Jitterbug 3423)
2883     //
2884     {
2885         UErrorCode     status     = U_ZERO_ERROR;
2886         UText          pattern    = UTEXT_INITIALIZER;
2887         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2888         utext_openUTF8(&pattern, str_pL, -1, &status);
2889
2890         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2891         RegexPattern  *pClone     = pSource->clone();
2892         delete         pSource;
2893         RegexMatcher  *mFromClone = pClone->matcher(status);
2894         REGEX_CHECK_STATUS;
2895
2896         UText          input      = UTEXT_INITIALIZER;
2897         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2898         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2899         mFromClone->reset(&input);
2900         REGEX_ASSERT(mFromClone->find() == TRUE);
2901         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2902         REGEX_ASSERT(mFromClone->find() == TRUE);
2903         REGEX_ASSERT(mFromClone->group(status) == "World");
2904         REGEX_ASSERT(mFromClone->find() == FALSE);
2905         delete mFromClone;
2906         delete pClone;
2907
2908         utext_close(&input);
2909         utext_close(&pattern);
2910     }
2911
2912     //
2913     //   matches convenience API
2914     //
2915     {
2916         UErrorCode status  = U_ZERO_ERROR;
2917         UText      pattern = UTEXT_INITIALIZER;
2918         UText      input   = UTEXT_INITIALIZER;
2919
2920         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2921         utext_openUTF8(&input, str_randominput, -1, &status);
2922
2923         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2924         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2925         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2926         REGEX_CHECK_STATUS;
2927
2928         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2929         utext_openUTF8(&pattern, str_abc, -1, &status);
2930         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2931         REGEX_CHECK_STATUS;
2932
2933         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2934         utext_openUTF8(&pattern, str_nput, -1, &status);
2935         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2936         REGEX_CHECK_STATUS;
2937
2938         utext_openUTF8(&pattern, str_randominput, -1, &status);
2939         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2940         REGEX_CHECK_STATUS;
2941
2942         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2943         utext_openUTF8(&pattern, str_u, -1, &status);
2944         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2945         REGEX_CHECK_STATUS;
2946
2947         utext_openUTF8(&input, str_abc, -1, &status);
2948         utext_openUTF8(&pattern, str_abc, -1, &status);
2949         status = U_INDEX_OUTOFBOUNDS_ERROR;
2950         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2951         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2952
2953         utext_close(&input);
2954         utext_close(&pattern);
2955     }
2956
2957
2958     //
2959     // Split()
2960     //
2961     status = U_ZERO_ERROR;
2962     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2963     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2964     pat1 = RegexPattern::compile(&re1, pe, status);
2965     REGEX_CHECK_STATUS;
2966     UnicodeString  fields[10];
2967
2968     int32_t n;
2969     n = pat1->split("Now is the time", fields, 10, status);
2970     REGEX_CHECK_STATUS;
2971     REGEX_ASSERT(n==4);
2972     REGEX_ASSERT(fields[0]=="Now");
2973     REGEX_ASSERT(fields[1]=="is");
2974     REGEX_ASSERT(fields[2]=="the");
2975     REGEX_ASSERT(fields[3]=="time");
2976     REGEX_ASSERT(fields[4]=="");
2977
2978     n = pat1->split("Now is the time", fields, 2, status);
2979     REGEX_CHECK_STATUS;
2980     REGEX_ASSERT(n==2);
2981     REGEX_ASSERT(fields[0]=="Now");
2982     REGEX_ASSERT(fields[1]=="is the time");
2983     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2984
2985     fields[1] = "*";
2986     status = U_ZERO_ERROR;
2987     n = pat1->split("Now is the time", fields, 1, status);
2988     REGEX_CHECK_STATUS;
2989     REGEX_ASSERT(n==1);
2990     REGEX_ASSERT(fields[0]=="Now is the time");
2991     REGEX_ASSERT(fields[1]=="*");
2992     status = U_ZERO_ERROR;
2993
2994     n = pat1->split("    Now       is the time   ", fields, 10, status);
2995     REGEX_CHECK_STATUS;
2996     REGEX_ASSERT(n==6);
2997     REGEX_ASSERT(fields[0]=="");
2998     REGEX_ASSERT(fields[1]=="Now");
2999     REGEX_ASSERT(fields[2]=="is");
3000     REGEX_ASSERT(fields[3]=="the");
3001     REGEX_ASSERT(fields[4]=="time");
3002     REGEX_ASSERT(fields[5]=="");
3003     REGEX_ASSERT(fields[6]=="");
3004
3005     fields[2] = "*";
3006     n = pat1->split("     ", fields, 10, status);
3007     REGEX_CHECK_STATUS;
3008     REGEX_ASSERT(n==2);
3009     REGEX_ASSERT(fields[0]=="");
3010     REGEX_ASSERT(fields[1]=="");
3011     REGEX_ASSERT(fields[2]=="*");
3012
3013     fields[0] = "foo";
3014     n = pat1->split("", fields, 10, status);
3015     REGEX_CHECK_STATUS;
3016     REGEX_ASSERT(n==0);
3017     REGEX_ASSERT(fields[0]=="foo");
3018
3019     delete pat1;
3020
3021     //  split, with a pattern with (capture)
3022     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3023     pat1 = RegexPattern::compile(&re1,  pe, status);
3024     REGEX_CHECK_STATUS;
3025
3026     status = U_ZERO_ERROR;
3027     fields[6] = fields[7] = "*";
3028     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3029     REGEX_CHECK_STATUS;
3030     REGEX_ASSERT(n==7);
3031     REGEX_ASSERT(fields[0]=="");
3032     REGEX_ASSERT(fields[1]=="a");
3033     REGEX_ASSERT(fields[2]=="Now is ");
3034     REGEX_ASSERT(fields[3]=="b");
3035     REGEX_ASSERT(fields[4]=="the time");
3036     REGEX_ASSERT(fields[5]=="c");
3037     REGEX_ASSERT(fields[6]=="");
3038     REGEX_ASSERT(fields[7]=="*");
3039     REGEX_ASSERT(status==U_ZERO_ERROR);
3040
3041     fields[6] = fields[7] = "*";
3042     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3043     REGEX_CHECK_STATUS;
3044     REGEX_ASSERT(n==7);
3045     REGEX_ASSERT(fields[0]=="  ");
3046     REGEX_ASSERT(fields[1]=="a");
3047     REGEX_ASSERT(fields[2]=="Now is ");
3048     REGEX_ASSERT(fields[3]=="b");
3049     REGEX_ASSERT(fields[4]=="the time");
3050     REGEX_ASSERT(fields[5]=="c");
3051     REGEX_ASSERT(fields[6]=="");
3052     REGEX_ASSERT(fields[7]=="*");
3053
3054     status = U_ZERO_ERROR;
3055     fields[6] = "foo";
3056     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3057     REGEX_CHECK_STATUS;
3058     REGEX_ASSERT(n==6);
3059     REGEX_ASSERT(fields[0]=="  ");
3060     REGEX_ASSERT(fields[1]=="a");
3061     REGEX_ASSERT(fields[2]=="Now is ");
3062     REGEX_ASSERT(fields[3]=="b");
3063     REGEX_ASSERT(fields[4]=="the time");
3064     REGEX_ASSERT(fields[5]==" ");
3065     REGEX_ASSERT(fields[6]=="foo");
3066
3067     status = U_ZERO_ERROR;
3068     fields[5] = "foo";
3069     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3070     REGEX_CHECK_STATUS;
3071     REGEX_ASSERT(n==5);
3072     REGEX_ASSERT(fields[0]=="  ");
3073     REGEX_ASSERT(fields[1]=="a");
3074     REGEX_ASSERT(fields[2]=="Now is ");
3075     REGEX_ASSERT(fields[3]=="b");
3076     REGEX_ASSERT(fields[4]=="the time<c>");
3077     REGEX_ASSERT(fields[5]=="foo");
3078
3079     status = U_ZERO_ERROR;
3080     fields[5] = "foo";
3081     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3082     REGEX_CHECK_STATUS;
3083     REGEX_ASSERT(n==5);
3084     REGEX_ASSERT(fields[0]=="  ");
3085     REGEX_ASSERT(fields[1]=="a");
3086     REGEX_ASSERT(fields[2]=="Now is ");
3087     REGEX_ASSERT(fields[3]=="b");
3088     REGEX_ASSERT(fields[4]=="the time");
3089     REGEX_ASSERT(fields[5]=="foo");
3090
3091     status = U_ZERO_ERROR;
3092     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3093     REGEX_CHECK_STATUS;
3094     REGEX_ASSERT(n==4);
3095     REGEX_ASSERT(fields[0]=="  ");
3096     REGEX_ASSERT(fields[1]=="a");
3097     REGEX_ASSERT(fields[2]=="Now is ");
3098     REGEX_ASSERT(fields[3]=="the time<c>");
3099     status = U_ZERO_ERROR;
3100     delete pat1;
3101
3102     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3103     pat1 = RegexPattern::compile(&re1, pe, status);
3104     REGEX_CHECK_STATUS;
3105     n = pat1->split("1-10,20", fields, 10, status);
3106     REGEX_CHECK_STATUS;
3107     REGEX_ASSERT(n==5);
3108     REGEX_ASSERT(fields[0]=="1");
3109     REGEX_ASSERT(fields[1]=="-");
3110     REGEX_ASSERT(fields[2]=="10");
3111     REGEX_ASSERT(fields[3]==",");
3112     REGEX_ASSERT(fields[4]=="20");
3113     delete pat1;
3114
3115
3116     //
3117     // split of a UText based string, with library allocating output UTexts.
3118     //
3119     {
3120         status = U_ZERO_ERROR;
3121         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3122         UnicodeString stringToSplit("first:second:third");
3123         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3124         REGEX_CHECK_STATUS;
3125
3126         UText *splits[10] = {NULL};
3127         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3128         REGEX_CHECK_STATUS;
3129         REGEX_ASSERT(numFields == 5);
3130         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3131         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3132         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3133         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3134         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3135         REGEX_ASSERT(splits[5] == NULL);
3136
3137         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3138             if (splits[i]) {
3139                 utext_close(splits[i]);
3140                 splits[i] = NULL;
3141             }
3142         }
3143         utext_close(textToSplit);
3144     }
3145
3146
3147     //
3148     // RegexPattern::pattern() and patternText()
3149     //
3150     pat1 = new RegexPattern();
3151     REGEX_ASSERT(pat1->pattern() == "");
3152     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3153     delete pat1;
3154     const char *helloWorldInvariant = "(Hello, world)*";
3155     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3156     pat1 = RegexPattern::compile(&re1, pe, status);
3157     REGEX_CHECK_STATUS;
3158     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3159     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3160     delete pat1;
3161
3162     utext_close(&re1);
3163 }
3164
3165
3166 //---------------------------------------------------------------------------
3167 //
3168 //      Extended       A more thorough check for features of regex patterns
3169 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3171 //                     A description of the test data format is included in that file.
3172 //
3173 //---------------------------------------------------------------------------
3174
3175 const char *
3176 RegexTest::getPath(char buffer[2048], const char *filename) {
3177     UErrorCode status=U_ZERO_ERROR;
3178     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3179     if (U_FAILURE(status)) {
3180         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3181         return NULL;
3182     }
3183
3184     strcpy(buffer, testDataDirectory);
3185     strcat(buffer, filename);
3186     return buffer;
3187 }
3188
3189 void RegexTest::Extended() {
3190     char tdd[2048];
3191     const char *srcPath;
3192     UErrorCode  status  = U_ZERO_ERROR;
3193     int32_t     lineNum = 0;
3194
3195     //
3196     //  Open and read the test data file.
3197     //
3198     srcPath=getPath(tdd, "regextst.txt");
3199     if(srcPath==NULL) {
3200         return; /* something went wrong, error already output */
3201     }
3202
3203     int32_t    len;
3204     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3205     if (U_FAILURE(status)) {
3206         return; /* something went wrong, error already output */
3207     }
3208
3209     //
3210     //  Put the test data into a UnicodeString
3211     //
3212     UnicodeString testString(FALSE, testData, len);
3213
3214     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3215     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3216     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3217
3218     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3219     UnicodeString   testPattern;   // The pattern for test from the test file.
3220     UnicodeString   testFlags;     // the flags   for a test.
3221     UnicodeString   matchString;   // The marked up string to be used as input
3222
3223     if (U_FAILURE(status)){
3224         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3225         delete [] testData;
3226         return;
3227     }
3228
3229     //
3230     //  Loop over the test data file, once per line.
3231     //
3232     while (lineMat.find()) {
3233         lineNum++;
3234         if (U_FAILURE(status)) {
3235           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3236         }
3237
3238         status = U_ZERO_ERROR;
3239         UnicodeString testLine = lineMat.group(1, status);
3240         if (testLine.length() == 0) {
3241             continue;
3242         }
3243
3244         //
3245         // Parse the test line.  Skip blank and comment only lines.
3246         // Separate out the three main fields - pattern, flags, target.
3247         //
3248
3249         commentMat.reset(testLine);
3250         if (commentMat.lookingAt(status)) {
3251             // This line is a comment, or blank.
3252             continue;
3253         }
3254
3255         //
3256         //  Pull out the pattern field, remove it from the test file line.
3257         //
3258         quotedStuffMat.reset(testLine);
3259         if (quotedStuffMat.lookingAt(status)) {
3260             testPattern = quotedStuffMat.group(2, status);
3261             testLine.remove(0, quotedStuffMat.end(0, status));
3262         } else {
3263             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3264             continue;
3265         }
3266
3267
3268         //
3269         //  Pull out the flags from the test file line.
3270         //
3271         flagsMat.reset(testLine);
3272         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3273         testFlags = flagsMat.group(1, status);
3274         if (flagsMat.group(2, status).length() > 0) {
3275             errln("Bad Match flag at line %d. Scanning %c\n",
3276                 lineNum, flagsMat.group(2, status).charAt(0));
3277             continue;
3278         }
3279         testLine.remove(0, flagsMat.end(0, status));
3280
3281         //
3282         //  Pull out the match string, as a whole.
3283         //    We'll process the <tags> later.
3284         //
3285         quotedStuffMat.reset(testLine);
3286         if (quotedStuffMat.lookingAt(status)) {
3287             matchString = quotedStuffMat.group(2, status);
3288             testLine.remove(0, quotedStuffMat.end(0, status));
3289         } else {
3290             errln("Bad match string at test file line %d", lineNum);
3291             continue;
3292         }
3293
3294         //
3295         //  The only thing left from the input line should be an optional trailing comment.
3296         //
3297         commentMat.reset(testLine);
3298         if (commentMat.lookingAt(status) == FALSE) {
3299             errln("Line %d: unexpected characters at end of test line.", lineNum);
3300             continue;
3301         }
3302
3303         //
3304         //  Run the test
3305         //
3306         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3307     }
3308
3309     delete [] testData;
3310
3311 }
3312
3313
3314
3315 //---------------------------------------------------------------------------
3316 //
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in diagnostic messages.
//         lineNumber is the source line in regextst.txt of the test.
3323 //
3324 //---------------------------------------------------------------------------
3325
3326
3327 //  Set a value into a UVector at position specified by a decimal number in
3328 //   a UnicodeString.   This is a utility function needed by the actual test function,
3329 //   which follows.
3330 static void set(UVector &vec, int32_t val, UnicodeString index) {
3331     UErrorCode  status=U_ZERO_ERROR;
3332     int32_t  idx = 0;
3333     for (int32_t i=0; i<index.length(); i++) {
3334         int32_t d=u_charDigitValue(index.charAt(i));
3335         if (d<0) {return;}
3336         idx = idx*10 + d;
3337     }
3338     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3339     vec.setElementAt(val, idx);
3340 }
3341
3342 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3343     UErrorCode  status=U_ZERO_ERROR;
3344     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3345     vec.setElementAt(val, idx);
3346 }
3347
3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3349 {
3350     UBool couldFind = TRUE;
3351     UTEXT_SETNATIVEINDEX(utext, 0);
3352     int32_t i = 0;
3353     while (i < unistrOffset) {
3354         UChar32 c = UTEXT_NEXT32(utext);
3355         if (c != U_SENTINEL) {
3356             i += U16_LENGTH(c);
3357         } else {
3358             couldFind = FALSE;
3359             break;
3360         }
3361     }
3362     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3363     return couldFind;
3364 }
3365
3366
3367 void RegexTest::regex_find(const UnicodeString &pattern,
3368                            const UnicodeString &flags,
3369                            const UnicodeString &inputString,
3370                            const char *srcPath,
3371                            int32_t line) {
3372     UnicodeString       unEscapedInput;
3373     UnicodeString       deTaggedInput;
3374
3375     int32_t             patternUTF8Length,      inputUTF8Length;
3376     char                *patternChars  = NULL, *inputChars = NULL;
3377     UText               patternText    = UTEXT_INITIALIZER;
3378     UText               inputText      = UTEXT_INITIALIZER;
3379     UConverter          *UTF8Converter = NULL;
3380
3381     UErrorCode          status         = U_ZERO_ERROR;
3382     UParseError         pe;
3383     RegexPattern        *parsePat      = NULL;
3384     RegexMatcher        *parseMatcher  = NULL;
3385     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3386     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3387     UVector             groupStarts(status);
3388     UVector             groupEnds(status);
3389     UVector             groupStartsUTF8(status);
3390     UVector             groupEndsUTF8(status);
3391     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3392     UBool               failed         = FALSE;
3393     int32_t             numFinds;
3394     int32_t             i;
3395     UBool               useMatchesFunc   = FALSE;
3396     UBool               useLookingAtFunc = FALSE;
3397     int32_t             regionStart      = -1;
3398     int32_t             regionEnd        = -1;
3399     int32_t             regionStartUTF8  = -1;
3400     int32_t             regionEndUTF8    = -1;
3401
3402
3403     //
3404     //  Compile the caller's pattern
3405     //
3406     uint32_t bflags = 0;
3407     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3408         bflags |= UREGEX_CASE_INSENSITIVE;
3409     }
3410     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3411         bflags |= UREGEX_COMMENTS;
3412     }
3413     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3414         bflags |= UREGEX_DOTALL;
3415     }
3416     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3417         bflags |= UREGEX_MULTILINE;
3418     }
3419
3420     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3421         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3422     }
3423     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3424         bflags |= UREGEX_UNIX_LINES;
3425     }
3426     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3427         bflags |= UREGEX_LITERAL;
3428     }
3429
3430
3431     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3432     if (status != U_ZERO_ERROR) {
3433         #if UCONFIG_NO_BREAK_ITERATION==1
3434         // 'v' test flag means that the test pattern should not compile if ICU was configured
3435         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3436         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3437             goto cleanupAndReturn;
3438         }
3439         #endif
3440         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3441             // Expected pattern compilation error.
3442             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3443                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3444             }
3445             goto cleanupAndReturn;
3446         } else {
3447             // Unexpected pattern compilation error.
3448             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3449             goto cleanupAndReturn;
3450         }
3451     }
3452
3453     UTF8Converter = ucnv_open("UTF8", &status);
3454     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3455
3456     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3457     status = U_ZERO_ERROR; // buffer overflow
3458     patternChars = new char[patternUTF8Length+1];
3459     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3460     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3461
3462     if (status == U_ZERO_ERROR) {
3463         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3464
3465         if (status != U_ZERO_ERROR) {
3466 #if UCONFIG_NO_BREAK_ITERATION==1
3467             // 'v' test flag means that the test pattern should not compile if ICU was configured
3468             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3469             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3470                 goto cleanupAndReturn;
3471             }
3472 #endif
3473             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3474                 // Expected pattern compilation error.
3475                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3476                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3477                 }
3478                 goto cleanupAndReturn;
3479             } else {
3480                 // Unexpected pattern compilation error.
3481                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3482                 goto cleanupAndReturn;
3483             }
3484         }
3485     }
3486
3487     if (UTF8Pattern == NULL) {
3488         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3489         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3490         status = U_ZERO_ERROR;
3491     }
3492
3493     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3494         callerPattern->dumpPattern();
3495     }
3496
3497     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3498         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3499         goto cleanupAndReturn;
3500     }
3501
3502
3503     //
3504     // Number of times find() should be called on the test string, default to 1
3505     //
3506     numFinds = 1;
3507     for (i=2; i<=9; i++) {
3508         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3509             if (numFinds != 1) {
3510                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3511                 goto cleanupAndReturn;
3512             }
3513             numFinds = i;
3514         }
3515     }
3516
3517     // 'M' flag.  Use matches() instead of find()
3518     if (flags.indexOf((UChar)0x4d) >= 0) {
3519         useMatchesFunc = TRUE;
3520     }
3521     if (flags.indexOf((UChar)0x4c) >= 0) {
3522         useLookingAtFunc = TRUE;
3523     }
3524
3525     //
3526     //  Find the tags in the input data, remove them, and record the group boundary
3527     //    positions.
3528     //
3529     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3530     REGEX_CHECK_STATUS_L(line);
3531
3532     unEscapedInput = inputString.unescape();
3533     parseMatcher = parsePat->matcher(unEscapedInput, status);
3534     REGEX_CHECK_STATUS_L(line);
3535     while(parseMatcher->find()) {
3536         parseMatcher->appendReplacement(deTaggedInput, "", status);
3537         REGEX_CHECK_STATUS;
3538         UnicodeString groupNum = parseMatcher->group(2, status);
3539         if (groupNum == "r") {
3540             // <r> or </r>, a region specification within the string
3541             if (parseMatcher->group(1, status) == "/") {
3542                 regionEnd = deTaggedInput.length();
3543             } else {
3544                 regionStart = deTaggedInput.length();
3545             }
3546         } else {
3547             // <digits> or </digits>, a group match boundary tag.
3548             if (parseMatcher->group(1, status) == "/") {
3549                 set(groupEnds, deTaggedInput.length(), groupNum);
3550             } else {
3551                 set(groupStarts, deTaggedInput.length(), groupNum);
3552             }
3553         }
3554     }
3555     parseMatcher->appendTail(deTaggedInput);
3556     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3557     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3558       errln("mismatched <r> tags");
3559       failed = TRUE;
3560       goto cleanupAndReturn;
3561     }
3562
3563     //
3564     //  Configure the matcher according to the flags specified with this test.
3565     //
3566     matcher = callerPattern->matcher(deTaggedInput, status);
3567     REGEX_CHECK_STATUS_L(line);
3568     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3569         matcher->setTrace(TRUE);
3570     }
3571
3572     if (UTF8Pattern != NULL) {
3573         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3574         status = U_ZERO_ERROR; // buffer overflow
3575         inputChars = new char[inputUTF8Length+1];
3576         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3577         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3578
3579         if (status == U_ZERO_ERROR) {
3580             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3581             REGEX_CHECK_STATUS_L(line);
3582         }
3583
3584         if (UTF8Matcher == NULL) {
3585             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3586             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3587             status = U_ZERO_ERROR;
3588         }
3589     }
3590
3591     //
3592     //  Generate native indices for UTF8 versions of region and capture group info
3593     //
3594     if (UTF8Matcher != NULL) {
3595         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3596             UTF8Matcher->setTrace(TRUE);
3597         }
3598         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3599         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3600
3601         //  Fill out the native index UVector info.
3602         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3603         for (i=0; i<groupStarts.size(); i++) {
3604             int32_t  start = groupStarts.elementAti(i);
3605             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3606             if (start >= 0) {
3607                 int32_t  startUTF8;
3608                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3609                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3610                     failed = TRUE;
3611                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3612                 }
3613                 setInt(groupStartsUTF8, startUTF8, i);
3614             }
3615
3616             int32_t  end = groupEnds.elementAti(i);
3617             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3618             if (end >= 0) {
3619                 int32_t  endUTF8;
3620                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3621                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3622                     failed = TRUE;
3623                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3624                 }
3625                 setInt(groupEndsUTF8, endUTF8, i);
3626             }
3627         }
3628     }
3629
3630     if (regionStart>=0) {
3631        matcher->region(regionStart, regionEnd, status);
3632        REGEX_CHECK_STATUS_L(line);
3633        if (UTF8Matcher != NULL) {
3634            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3635            REGEX_CHECK_STATUS_L(line);
3636        }
3637     }
3638     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3639         matcher->useAnchoringBounds(FALSE);
3640         if (UTF8Matcher != NULL) {
3641             UTF8Matcher->useAnchoringBounds(FALSE);
3642         }
3643     }
3644     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3645         matcher->useTransparentBounds(TRUE);
3646         if (UTF8Matcher != NULL) {
3647             UTF8Matcher->useTransparentBounds(TRUE);
3648         }
3649     }
3650
3651
3652
3653     //
3654     // Do a find on the de-tagged input using the caller's pattern
3655     //     TODO: error on count>1 and not find().
3656     //           error on both matches() and lookingAt().
3657     //
3658     for (i=0; i<numFinds; i++) {
3659         if (useMatchesFunc) {
3660             isMatch = matcher->matches(status);
3661             if (UTF8Matcher != NULL) {
3662                isUTF8Match = UTF8Matcher->matches(status);
3663             }
3664         } else  if (useLookingAtFunc) {
3665             isMatch = matcher->lookingAt(status);
3666             if (UTF8Matcher != NULL) {
3667                 isUTF8Match = UTF8Matcher->lookingAt(status);
3668             }
3669         } else {
3670             isMatch = matcher->find();
3671             if (UTF8Matcher != NULL) {
3672                 isUTF8Match = UTF8Matcher->find();
3673             }
3674         }
3675     }
3676     matcher->setTrace(FALSE);
3677     if (UTF8Matcher) {
3678         UTF8Matcher->setTrace(FALSE);
3679     }
3680     if (U_FAILURE(status)) {
3681         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3682     }
3683
3684     //
3685     // Match up the groups from the find() with the groups from the tags
3686     //
3687
3688     // number of tags should match number of groups from find operation.
3689     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3690     //   G option in test means that capture group data is not available in the
3691     //     expected results, so the check needs to be suppressed.
3692     if (isMatch == FALSE && groupStarts.size() != 0) {
3693         dataerrln("Error at line %d:  Match expected, but none found.", line);
3694         failed = TRUE;
3695         goto cleanupAndReturn;
3696     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3697         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3698         failed = TRUE;
3699         goto cleanupAndReturn;
3700     }
3701     if (isMatch && groupStarts.size() == 0) {
3702         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3703         failed = TRUE;
3704     }
3705     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3706         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3707         failed = TRUE;
3708     }
3709
3710     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3711         // Only check for match / no match.  Don't check capture groups.
3712         goto cleanupAndReturn;
3713     }
3714
3715     REGEX_CHECK_STATUS_L(line);
3716     for (i=0; i<=matcher->groupCount(); i++) {
3717         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3718         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3719         if (matcher->start(i, status) != expectedStart) {
3720             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3721                 line, i, expectedStart, matcher->start(i, status));
3722             failed = TRUE;
3723             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3724         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3725             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3726                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3727             failed = TRUE;
3728             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3729         }
3730
3731         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3732         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3733         if (matcher->end(i, status) != expectedEnd) {
3734             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3735                 line, i, expectedEnd, matcher->end(i, status));
3736             failed = TRUE;
3737             // Error on end position;  keep going; real error is probably yet to come as group
3738             //   end positions work from end of the input data towards the front.
3739         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3740             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3741                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3742             failed = TRUE;
3743             // Error on end position;  keep going; real error is probably yet to come as group
3744             //   end positions work from end of the input data towards the front.
3745         }
3746     }
3747     if ( matcher->groupCount()+1 < groupStarts.size()) {
3748         errln("Error at line %d: Expected %d capture groups, found %d.",
3749             line, groupStarts.size()-1, matcher->groupCount());
3750         failed = TRUE;
3751         }
3752     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3753         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3754               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3755         failed = TRUE;
3756     }
3757
3758     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3759         matcher->requireEnd() == TRUE) {
3760         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3761         failed = TRUE;
3762     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3763         UTF8Matcher->requireEnd() == TRUE) {
3764         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3765         failed = TRUE;
3766     }
3767
3768     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3769         matcher->requireEnd() == FALSE) {
3770         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3771         failed = TRUE;
3772     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3773         UTF8Matcher->requireEnd() == FALSE) {
3774         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3775         failed = TRUE;
3776     }
3777
3778     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3779         matcher->hitEnd() == TRUE) {
3780         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3781         failed = TRUE;
3782     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3783                UTF8Matcher->hitEnd() == TRUE) {
3784         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3785         failed = TRUE;
3786     }
3787
3788     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3789         matcher->hitEnd() == FALSE) {
3790         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3791         failed = TRUE;
3792     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3793                UTF8Matcher->hitEnd() == FALSE) {
3794         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3795         failed = TRUE;
3796     }
3797
3798
3799 cleanupAndReturn:
3800     if (failed) {
3801         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3802             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3803         // callerPattern->dump();
3804     }
3805     delete parseMatcher;
3806     delete parsePat;
3807     delete UTF8Matcher;
3808     delete UTF8Pattern;
3809     delete matcher;
3810     delete callerPattern;
3811
3812     utext_close(&inputText);
3813     delete[] inputChars;
3814     utext_close(&patternText);
3815     delete[] patternChars;
3816     ucnv_close(UTF8Converter);
3817 }
3818
3819
3820
3821
3822 //---------------------------------------------------------------------------
3823 //
3824 //      Errors     Check for error handling in patterns.
3825 //
3826 //---------------------------------------------------------------------------
3827 void RegexTest::Errors() {
3828     // \escape sequences that aren't implemented yet.
3829     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3830
3831     // Missing close parentheses
3832     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3833     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3834     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3835
3836     // Extra close paren
3837     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3838     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3839     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3840
3841     // Look-ahead, Look-behind
3842     //  TODO:  add tests for unbounded length look-behinds.
3843     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3844
3845     // Attempt to use non-default flags
3846     {
3847         UParseError   pe;
3848         UErrorCode    status = U_ZERO_ERROR;
3849         int32_t       flags  = UREGEX_CANON_EQ |
3850                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3851                                UREGEX_MULTILINE;
3852         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3853         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3854         delete pat1;
3855     }
3856
3857
3858     // Quantifiers are allowed only after something that can be quantified.
3859     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3860     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3861     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3862
3863     // Mal-formed {min,max} quantifiers
3864     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3865     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3866     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3867     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3868     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3869     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3870     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3871     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3872     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3873
3874     // Ticket 5389
3875     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3876
3877     // Invalid Back Reference \0
3878     //    For ICU 3.8 and earlier
3879     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3880     //
3881     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3882
3883 }
3884
3885
3886 //-------------------------------------------------------------------------------
3887 //
3888 //  Read a text data file, convert it to UChars, and return the data
3889 //    in one big UChar * buffer, which the caller must delete.
3890 //
3891 //--------------------------------------------------------------------------------
3892 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3893                                      const char *defEncoding, UErrorCode &status) {
3894     UChar       *retPtr  = NULL;
3895     char        *fileBuf = NULL;
3896     UConverter* conv     = NULL;
3897     FILE        *f       = NULL;
3898
3899     ulen = 0;
3900     if (U_FAILURE(status)) {
3901         return retPtr;
3902     }
3903
3904     //
3905     //  Open the file.
3906     //
3907     f = fopen(fileName, "rb");
3908     if (f == 0) {
3909         dataerrln("Error opening test data file %s\n", fileName);
3910         status = U_FILE_ACCESS_ERROR;
3911         return NULL;
3912     }
3913     //
3914     //  Read it in
3915     //
3916     int32_t            fileSize;
3917     int32_t            amt_read;
3918
3919     fseek( f, 0, SEEK_END);
3920     fileSize = ftell(f);
3921     fileBuf = new char[fileSize];
3922     fseek(f, 0, SEEK_SET);
3923     amt_read = fread(fileBuf, 1, fileSize, f);
3924     if (amt_read != fileSize || fileSize <= 0) {
3925         errln("Error reading test data file.");
3926         goto cleanUpAndReturn;
3927     }
3928
3929     //
3930     // Look for a Unicode Signature (BOM) on the data just read
3931     //
3932     int32_t        signatureLength;
3933     const char *   fileBufC;
3934     const char*    encoding;
3935
3936     fileBufC = fileBuf;
3937     encoding = ucnv_detectUnicodeSignature(
3938         fileBuf, fileSize, &signatureLength, &status);
3939     if(encoding!=NULL ){
3940         fileBufC  += signatureLength;
3941         fileSize  -= signatureLength;
3942     } else {
3943         encoding = defEncoding;
3944         if (strcmp(encoding, "utf-8") == 0) {
3945             errln("file %s is missing its BOM", fileName);
3946         }
3947     }
3948
3949     //
3950     // Open a converter to take the rule file to UTF-16
3951     //
3952     conv = ucnv_open(encoding, &status);
3953     if (U_FAILURE(status)) {
3954         goto cleanUpAndReturn;
3955     }
3956
3957     //
3958     // Convert the rules to UChar.
3959     //  Preflight first to determine required buffer size.
3960     //
3961     ulen = ucnv_toUChars(conv,
3962         NULL,           //  dest,
3963         0,              //  destCapacity,
3964         fileBufC,
3965         fileSize,
3966         &status);
3967     if (status == U_BUFFER_OVERFLOW_ERROR) {
3968         // Buffer Overflow is expected from the preflight operation.
3969         status = U_ZERO_ERROR;
3970
3971         retPtr = new UChar[ulen+1];
3972         ucnv_toUChars(conv,
3973             retPtr,       //  dest,
3974             ulen+1,
3975             fileBufC,
3976             fileSize,
3977             &status);
3978     }
3979
3980 cleanUpAndReturn:
3981     fclose(f);
3982     delete[] fileBuf;
3983     ucnv_close(conv);
3984     if (U_FAILURE(status)) {
3985         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3986         delete []retPtr;
3987         retPtr = 0;
3988         ulen   = 0;
3989     };
3990     return retPtr;
3991 }
3992
3993
3994 //-------------------------------------------------------------------------------
3995 //
3996 //   PerlTests  - Run Perl's regular expression tests
3997 //                The input file for this test is re_tests, the standard regular
3998 //                expression test data distributed with the Perl source code.
3999 //
4000 //                Here is Perl's description of the test data file:
4001 //
4002 //        # The tests are in a separate file 't/op/re_tests'.
4003 //        # Each line in that file is a separate test.
4004 //        # There are five columns, separated by tabs.
4005 //        #
4006 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
4007 //        # Modifiers can be put after the closing C<'>.
4008 //        #
4009 //        # Column 2 contains the string to be matched.
4010 //        #
4011 //        # Column 3 contains the expected result:
4012 //        #     y   expect a match
4013 //        #     n   expect no match
4014 //        #     c   expect an error
4015 //        # B   test exposes a known bug in Perl, should be skipped
4016 //        # b   test exposes a known bug in Perl, should be skipped if noamp
4017 //        #
4018 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4019 //        #
4020 //        # Column 4 contains a string, usually C<$&>.
4021 //        #
4022 //        # Column 5 contains the expected result of double-quote
4023 //        # interpolating that string after the match, or start of error message.
4024 //        #
4025 //        # Column 6, if present, contains a reason why the test is skipped.
4026 //        # This is printed with "skipped", for harness to pick up.
4027 //        #
4028 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4029 //        #
4030 //        # If you want to add a regular expression test that can't be expressed
4031 //        # in this format, don't add it here: put it in op/pat.t instead.
4032 //
4033 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
4035 //        (The i is in addition to whatever was there before.)
4036 //
4037 //-------------------------------------------------------------------------------
4038 void RegexTest::PerlTests() {
4039     char tdd[2048];
4040     const char *srcPath;
4041     UErrorCode  status = U_ZERO_ERROR;
4042     UParseError pe;
4043
4044     //
4045     //  Open and read the test data file.
4046     //
4047     srcPath=getPath(tdd, "re_tests.txt");
4048     if(srcPath==NULL) {
4049         return; /* something went wrong, error already output */
4050     }
4051
4052     int32_t    len;
4053     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4054     if (U_FAILURE(status)) {
4055         return; /* something went wrong, error already output */
4056     }
4057
4058     //
4059     //  Put the test data into a UnicodeString
4060     //
4061     UnicodeString testDataString(FALSE, testData, len);
4062
4063     //
4064     //  Regex to break the input file into lines, and strip the new lines.
4065     //     One line per match, capture group one is the desired data.
4066     //
4067     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4068     if (U_FAILURE(status)) {
4069         dataerrln("RegexPattern::compile() error");
4070         return;
4071     }
4072     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4073
4074     //
4075     //  Regex to split a test file line into fields.
4076     //    There are six fields, separated by tabs.
4077     //
4078     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4079
4080     //
4081     //  Regex to identify test patterns with flag settings, and to separate them.
4082     //    Test patterns with flags look like 'pattern'i
4083     //    Test patterns without flags are not quoted:   pattern
4084     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4085     //
4086     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4087     RegexMatcher* flagMat = flagPat->matcher(status);
4088
4089     //
4090     // The Perl tests reference several perl-isms, which are evaluated/substituted
4091     //   in the test data.  Not being perl, this must be done explicitly.  Here
4092     //   are string constants and REs for these constructs.
4093     //
4094     UnicodeString nulnulSrc("${nulnul}");
4095     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4096     nulnul = nulnul.unescape();
4097
4098     UnicodeString ffffSrc("${ffff}");
4099     UnicodeString ffff("\\uffff", -1, US_INV);
4100     ffff = ffff.unescape();
4101
4102     //  regexp for $-[0], $+[2], etc.
4103     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4104     RegexMatcher *groupsMat = groupsPat->matcher(status);
4105
4106     //  regexp for $0, $1, $2, etc.
4107     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4108     RegexMatcher *cgMat = cgPat->matcher(status);
4109
4110
4111     //
4112     // Main Loop for the Perl Tests, runs once per line from the
4113     //   test data file.
4114     //
4115     int32_t  lineNum = 0;
4116     int32_t  skippedUnimplementedCount = 0;
4117     while (lineMat->find()) {
4118         lineNum++;
4119
4120         //
4121         //  Get a line, break it into its fields, do the Perl
4122         //    variable substitutions.
4123         //
4124         UnicodeString line = lineMat->group(1, status);
4125         UnicodeString fields[7];
4126         fieldPat->split(line, fields, 7, status);
4127
4128         flagMat->reset(fields[0]);
4129         flagMat->matches(status);
4130         UnicodeString pattern  = flagMat->group(2, status);
4131         pattern.findAndReplace("${bang}", "!");
4132         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4133         pattern.findAndReplace(ffffSrc, ffff);
4134
4135         //
4136         //  Identify patterns that include match flag settings,
4137         //    split off the flags, remove the extra quotes.
4138         //
4139         UnicodeString flagStr = flagMat->group(3, status);
4140         if (U_FAILURE(status)) {
4141             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4142             return;
4143         }
4144         int32_t flags = 0;
4145         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4146         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4147         const UChar UChar_m = 0x6d;
4148         const UChar UChar_x = 0x78;
4149         const UChar UChar_y = 0x79;
4150         if (flagStr.indexOf(UChar_i) != -1) {
4151             flags |= UREGEX_CASE_INSENSITIVE;
4152         }
4153         if (flagStr.indexOf(UChar_m) != -1) {
4154             flags |= UREGEX_MULTILINE;
4155         }
4156         if (flagStr.indexOf(UChar_x) != -1) {
4157             flags |= UREGEX_COMMENTS;
4158         }
4159
4160         //
4161         // Compile the test pattern.
4162         //
4163         status = U_ZERO_ERROR;
4164         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4165         if (status == U_REGEX_UNIMPLEMENTED) {
4166             //
4167             // Test of a feature that is planned for ICU, but not yet implemented.
4168             //   skip the test.
4169             skippedUnimplementedCount++;
4170             delete testPat;
4171             status = U_ZERO_ERROR;
4172             continue;
4173         }
4174
4175         if (U_FAILURE(status)) {
4176             // Some tests are supposed to generate errors.
4177             //   Only report an error for tests that are supposed to succeed.
4178             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4179                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4180             {
4181                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4182             }
4183             status = U_ZERO_ERROR;
4184             delete testPat;
4185             continue;
4186         }
4187
4188         if (fields[2].indexOf(UChar_i) >= 0) {
4189             // ICU should skip this test.
4190             delete testPat;
4191             continue;
4192         }
4193
4194         if (fields[2].indexOf(UChar_c) >= 0) {
4195             // This pattern should have caused a compilation error, but didn't/
4196             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4197             delete testPat;
4198             continue;
4199         }
4200
4201         //
4202         // replace the Perl variables that appear in some of the
4203         //   match data strings.
4204         //
4205         UnicodeString matchString = fields[1];
4206         matchString.findAndReplace(nulnulSrc, nulnul);
4207         matchString.findAndReplace(ffffSrc,   ffff);
4208
4209         // Replace any \n in the match string with an actual new-line char.
4210         //  Don't do full unescape, as this unescapes more than Perl does, which
4211         //  causes other spurious failures in the tests.
4212         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4213
4214
4215
4216         //
4217         // Run the test, check for expected match/don't match result.
4218         //
4219         RegexMatcher *testMat = testPat->matcher(matchString, status);
4220         UBool found = testMat->find();
4221         UBool expected = FALSE;
4222         if (fields[2].indexOf(UChar_y) >=0) {
4223             expected = TRUE;
4224         }
4225         if (expected != found) {
4226             errln("line %d: Expected %smatch, got %smatch",
4227                 lineNum, expected?"":"no ", found?"":"no " );
4228             continue;
4229         }
4230
4231         // Don't try to check expected results if there is no match.
4232         //   (Some have stuff in the expected fields)
4233         if (!found) {
4234             delete testMat;
4235             delete testPat;
4236             continue;
4237         }
4238
4239         //
4240         // Interpret the Perl expression from the fourth field of the data file,
4241         // building up an ICU string from the results of the ICU match.
4242         //   The Perl expression will contain references to the results of
4243         //     a regex match, including the matched string, capture group strings,
4244         //     group starting and ending indicies, etc.
4245         //
4246         UnicodeString resultString;
4247         UnicodeString perlExpr = fields[3];
4248 #if SUPPORT_MUTATING_INPUT_STRING
4249         groupsMat->reset(perlExpr);
4250         cgMat->reset(perlExpr);
4251 #endif
4252
4253         while (perlExpr.length() > 0) {
4254 #if !SUPPORT_MUTATING_INPUT_STRING
4255             //  Perferred usage.  Reset after any modification to input string.
4256             groupsMat->reset(perlExpr);
4257             cgMat->reset(perlExpr);
4258 #endif
4259
4260             if (perlExpr.startsWith("$&")) {
4261                 resultString.append(testMat->group(status));
4262                 perlExpr.remove(0, 2);
4263             }
4264
4265             else if (groupsMat->lookingAt(status)) {
4266                 // $-[0]   $+[2]  etc.
4267                 UnicodeString digitString = groupsMat->group(2, status);
4268                 int32_t t = 0;
4269                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4270                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4271                 int32_t matchPosition;
4272                 if (plusOrMinus.compare("+") == 0) {
4273                     matchPosition = testMat->end(groupNum, status);
4274                 } else {
4275                     matchPosition = testMat->start(groupNum, status);
4276                 }
4277                 if (matchPosition != -1) {
4278                     ICU_Utility::appendNumber(resultString, matchPosition);
4279                 }
4280                 perlExpr.remove(0, groupsMat->end(status));
4281             }
4282
4283             else if (cgMat->lookingAt(status)) {
4284                 // $1, $2, $3, etc.
4285                 UnicodeString digitString = cgMat->group(1, status);
4286                 int32_t t = 0;
4287                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4288                 if (U_SUCCESS(status)) {
4289                     resultString.append(testMat->group(groupNum, status));
4290                     status = U_ZERO_ERROR;
4291                 }
4292                 perlExpr.remove(0, cgMat->end(status));
4293             }
4294
4295             else if (perlExpr.startsWith("@-")) {
4296                 int32_t i;
4297                 for (i=0; i<=testMat->groupCount(); i++) {
4298                     if (i>0) {
4299                         resultString.append(" ");
4300                     }
4301                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4302                 }
4303                 perlExpr.remove(0, 2);
4304             }
4305
4306             else if (perlExpr.startsWith("@+")) {
4307                 int32_t i;
4308                 for (i=0; i<=testMat->groupCount(); i++) {
4309                     if (i>0) {
4310                         resultString.append(" ");
4311                     }
4312                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4313                 }
4314                 perlExpr.remove(0, 2);
4315             }
4316
4317             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4318                                                      //           or as an escaped sequence (e.g. \n)
4319                 if (perlExpr.length() > 1) {
4320                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4321                 }
4322                 UChar c = perlExpr.charAt(0);
4323                 switch (c) {
4324                 case 'n':   c = '\n'; break;
4325                 // add any other escape sequences that show up in the test expected results.
4326                 }
4327                 resultString.append(c);
4328                 perlExpr.remove(0, 1);
4329             }
4330
4331             else  {
4332                 // Any characters from the perl expression that we don't explicitly
4333                 //  recognize before here are assumed to be literals and copied
4334                 //  as-is to the expected results.
4335                 resultString.append(perlExpr.charAt(0));
4336                 perlExpr.remove(0, 1);
4337             }
4338
4339             if (U_FAILURE(status)) {
4340                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4341                 break;
4342             }
4343         }
4344
4345         //
4346         // Expected Results Compare
4347         //
4348         UnicodeString expectedS(fields[4]);
4349         expectedS.findAndReplace(nulnulSrc, nulnul);
4350         expectedS.findAndReplace(ffffSrc,   ffff);
4351         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4352
4353
4354         if (expectedS.compare(resultString) != 0) {
4355             err("Line %d: Incorrect perl expression results.", lineNum);
4356             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4357         }
4358
4359         delete testMat;
4360         delete testPat;
4361     }
4362
4363     //
4364     // All done.  Clean up allocated stuff.
4365     //
4366     delete cgMat;
4367     delete cgPat;
4368
4369     delete groupsMat;
4370     delete groupsPat;
4371
4372     delete flagMat;
4373     delete flagPat;
4374
4375     delete lineMat;
4376     delete linePat;
4377
4378     delete fieldPat;
4379     delete [] testData;
4380
4381
4382     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4383
4384 }
4385
4386
4387 //-------------------------------------------------------------------------------
4388 //
4389 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4390 //                  (instead of using UnicodeStrings) to test the alternate engine.
4391 //                  The input file for this test is re_tests, the standard regular
4392 //                  expression test data distributed with the Perl source code.
4393 //                  See PerlTests() for more information.
4394 //
4395 //-------------------------------------------------------------------------------
4396 void RegexTest::PerlTestsUTF8() {
4397     char tdd[2048];
4398     const char *srcPath;
4399     UErrorCode  status = U_ZERO_ERROR;
4400     UParseError pe;
4401     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4402     UText       patternText = UTEXT_INITIALIZER;
4403     char       *patternChars = NULL;
4404     int32_t     patternLength;
4405     int32_t     patternCapacity = 0;
4406     UText       inputText = UTEXT_INITIALIZER;
4407     char       *inputChars = NULL;
4408     int32_t     inputLength;
4409     int32_t     inputCapacity = 0;
4410
4411     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4412
4413     //
4414     //  Open and read the test data file.
4415     //
4416     srcPath=getPath(tdd, "re_tests.txt");
4417     if(srcPath==NULL) {
4418         return; /* something went wrong, error already output */
4419     }
4420
4421     int32_t    len;
4422     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4423     if (U_FAILURE(status)) {
4424         return; /* something went wrong, error already output */
4425     }
4426
4427     //
4428     //  Put the test data into a UnicodeString
4429     //
4430     UnicodeString testDataString(FALSE, testData, len);
4431
4432     //
4433     //  Regex to break the input file into lines, and strip the new lines.
4434     //     One line per match, capture group one is the desired data.
4435     //
4436     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4437     if (U_FAILURE(status)) {
4438         dataerrln("RegexPattern::compile() error");
4439         return;
4440     }
4441     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4442
4443     //
4444     //  Regex to split a test file line into fields.
4445     //    There are six fields, separated by tabs.
4446     //
4447     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4448
4449     //
4450     //  Regex to identify test patterns with flag settings, and to separate them.
4451     //    Test patterns with flags look like 'pattern'i
4452     //    Test patterns without flags are not quoted:   pattern
4453     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4454     //
4455     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4456     RegexMatcher* flagMat = flagPat->matcher(status);
4457
4458     //
4459     // The Perl tests reference several perl-isms, which are evaluated/substituted
4460     //   in the test data.  Not being perl, this must be done explicitly.  Here
4461     //   are string constants and REs for these constructs.
4462     //
4463     UnicodeString nulnulSrc("${nulnul}");
4464     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4465     nulnul = nulnul.unescape();
4466
4467     UnicodeString ffffSrc("${ffff}");
4468     UnicodeString ffff("\\uffff", -1, US_INV);
4469     ffff = ffff.unescape();
4470
4471     //  regexp for $-[0], $+[2], etc.
4472     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4473     RegexMatcher *groupsMat = groupsPat->matcher(status);
4474
4475     //  regexp for $0, $1, $2, etc.
4476     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4477     RegexMatcher *cgMat = cgPat->matcher(status);
4478
4479
4480     //
4481     // Main Loop for the Perl Tests, runs once per line from the
4482     //   test data file.
4483     //
4484     int32_t  lineNum = 0;
4485     int32_t  skippedUnimplementedCount = 0;
4486     while (lineMat->find()) {
4487         lineNum++;
4488
4489         //
4490         //  Get a line, break it into its fields, do the Perl
4491         //    variable substitutions.
4492         //
4493         UnicodeString line = lineMat->group(1, status);
4494         UnicodeString fields[7];
4495         fieldPat->split(line, fields, 7, status);
4496
4497         flagMat->reset(fields[0]);
4498         flagMat->matches(status);
4499         UnicodeString pattern  = flagMat->group(2, status);
4500         pattern.findAndReplace("${bang}", "!");
4501         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4502         pattern.findAndReplace(ffffSrc, ffff);
4503
4504         //
4505         //  Identify patterns that include match flag settings,
4506         //    split off the flags, remove the extra quotes.
4507         //
4508         UnicodeString flagStr = flagMat->group(3, status);
4509         if (U_FAILURE(status)) {
4510             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4511             return;
4512         }
4513         int32_t flags = 0;
4514         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4515         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4516         const UChar UChar_m = 0x6d;
4517         const UChar UChar_x = 0x78;
4518         const UChar UChar_y = 0x79;
4519         if (flagStr.indexOf(UChar_i) != -1) {
4520             flags |= UREGEX_CASE_INSENSITIVE;
4521         }
4522         if (flagStr.indexOf(UChar_m) != -1) {
4523             flags |= UREGEX_MULTILINE;
4524         }
4525         if (flagStr.indexOf(UChar_x) != -1) {
4526             flags |= UREGEX_COMMENTS;
4527         }
4528
4529         //
4530         // Put the pattern in a UTF-8 UText
4531         //
4532         status = U_ZERO_ERROR;
4533         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4534         if (status == U_BUFFER_OVERFLOW_ERROR) {
4535             status = U_ZERO_ERROR;
4536             delete[] patternChars;
4537             patternCapacity = patternLength + 1;
4538             patternChars = new char[patternCapacity];
4539             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4540         }
4541         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4542
4543         //
4544         // Compile the test pattern.
4545         //
4546         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4547         if (status == U_REGEX_UNIMPLEMENTED) {
4548             //
4549             // Test of a feature that is planned for ICU, but not yet implemented.
4550             //   skip the test.
4551             skippedUnimplementedCount++;
4552             delete testPat;
4553             status = U_ZERO_ERROR;
4554             continue;
4555         }
4556
4557         if (U_FAILURE(status)) {
4558             // Some tests are supposed to generate errors.
4559             //   Only report an error for tests that are supposed to succeed.
4560             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4561                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4562             {
4563                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4564             }
4565             status = U_ZERO_ERROR;
4566             delete testPat;
4567             continue;
4568         }
4569
4570         if (fields[2].indexOf(UChar_i) >= 0) {
4571             // ICU should skip this test.
4572             delete testPat;
4573             continue;
4574         }
4575
4576         if (fields[2].indexOf(UChar_c) >= 0) {
4577             // This pattern should have caused a compilation error, but didn't/
4578             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4579             delete testPat;
4580             continue;
4581         }
4582
4583
4584         //
4585         // replace the Perl variables that appear in some of the
4586         //   match data strings.
4587         //
4588         UnicodeString matchString = fields[1];
4589         matchString.findAndReplace(nulnulSrc, nulnul);
4590         matchString.findAndReplace(ffffSrc,   ffff);
4591
4592         // Replace any \n in the match string with an actual new-line char.
4593         //  Don't do full unescape, as this unescapes more than Perl does, which
4594         //  causes other spurious failures in the tests.
4595         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4596
4597         //
4598         // Put the input in a UTF-8 UText
4599         //
4600         status = U_ZERO_ERROR;
4601         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4602         if (status == U_BUFFER_OVERFLOW_ERROR) {
4603             status = U_ZERO_ERROR;
4604             delete[] inputChars;
4605             inputCapacity = inputLength + 1;
4606             inputChars = new char[inputCapacity];
4607             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4608         }
4609         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4610
4611         //
4612         // Run the test, check for expected match/don't match result.
4613         //
4614         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4615         UBool found = testMat->find();
4616         UBool expected = FALSE;
4617         if (fields[2].indexOf(UChar_y) >=0) {
4618             expected = TRUE;
4619         }
4620         if (expected != found) {
4621             errln("line %d: Expected %smatch, got %smatch",
4622                 lineNum, expected?"":"no ", found?"":"no " );
4623             continue;
4624         }
4625
4626         // Don't try to check expected results if there is no match.
4627         //   (Some have stuff in the expected fields)
4628         if (!found) {
4629             delete testMat;
4630             delete testPat;
4631             continue;
4632         }
4633
4634         //
4635         // Interpret the Perl expression from the fourth field of the data file,
4636         // building up an ICU string from the results of the ICU match.
4637         //   The Perl expression will contain references to the results of
4638         //     a regex match, including the matched string, capture group strings,
4639         //     group starting and ending indicies, etc.
4640         //
4641         UnicodeString resultString;
4642         UnicodeString perlExpr = fields[3];
4643
4644         while (perlExpr.length() > 0) {
4645             groupsMat->reset(perlExpr);
4646             cgMat->reset(perlExpr);
4647
4648             if (perlExpr.startsWith("$&")) {
4649                 resultString.append(testMat->group(status));
4650                 perlExpr.remove(0, 2);
4651             }
4652
4653             else if (groupsMat->lookingAt(status)) {
4654                 // $-[0]   $+[2]  etc.
4655                 UnicodeString digitString = groupsMat->group(2, status);
4656                 int32_t t = 0;
4657                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4658                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4659                 int32_t matchPosition;
4660                 if (plusOrMinus.compare("+") == 0) {
4661                     matchPosition = testMat->end(groupNum, status);
4662                 } else {
4663                     matchPosition = testMat->start(groupNum, status);
4664                 }
4665                 if (matchPosition != -1) {
4666                     ICU_Utility::appendNumber(resultString, matchPosition);
4667                 }
4668                 perlExpr.remove(0, groupsMat->end(status));
4669             }
4670
4671             else if (cgMat->lookingAt(status)) {
4672                 // $1, $2, $3, etc.
4673                 UnicodeString digitString = cgMat->group(1, status);
4674                 int32_t t = 0;
4675                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4676                 if (U_SUCCESS(status)) {
4677                     resultString.append(testMat->group(groupNum, status));
4678                     status = U_ZERO_ERROR;
4679                 }
4680                 perlExpr.remove(0, cgMat->end(status));
4681             }
4682
4683             else if (perlExpr.startsWith("@-")) {
4684                 int32_t i;
4685                 for (i=0; i<=testMat->groupCount(); i++) {
4686                     if (i>0) {
4687                         resultString.append(" ");
4688                     }
4689                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4690                 }
4691                 perlExpr.remove(0, 2);
4692             }
4693
4694             else if (perlExpr.startsWith("@+")) {
4695                 int32_t i;
4696                 for (i=0; i<=testMat->groupCount(); i++) {
4697                     if (i>0) {
4698                         resultString.append(" ");
4699                     }
4700                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4701                 }
4702                 perlExpr.remove(0, 2);
4703             }
4704
4705             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4706                                                      //           or as an escaped sequence (e.g. \n)
4707                 if (perlExpr.length() > 1) {
4708                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4709                 }
4710                 UChar c = perlExpr.charAt(0);
4711                 switch (c) {
4712                 case 'n':   c = '\n'; break;
4713                 // add any other escape sequences that show up in the test expected results.
4714                 }
4715                 resultString.append(c);
4716                 perlExpr.remove(0, 1);
4717             }
4718
4719             else  {
4720                 // Any characters from the perl expression that we don't explicitly
4721                 //  recognize before here are assumed to be literals and copied
4722                 //  as-is to the expected results.
4723                 resultString.append(perlExpr.charAt(0));
4724                 perlExpr.remove(0, 1);
4725             }
4726
4727             if (U_FAILURE(status)) {
4728                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4729                 break;
4730             }
4731         }
4732
4733         //
4734         // Expected Results Compare
4735         //
4736         UnicodeString expectedS(fields[4]);
4737         expectedS.findAndReplace(nulnulSrc, nulnul);
4738         expectedS.findAndReplace(ffffSrc,   ffff);
4739         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4740
4741
4742         if (expectedS.compare(resultString) != 0) {
4743             err("Line %d: Incorrect perl expression results.", lineNum);
4744             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4745         }
4746
4747         delete testMat;
4748         delete testPat;
4749     }
4750
4751     //
4752     // All done.  Clean up allocated stuff.
4753     //
4754     delete cgMat;
4755     delete cgPat;
4756
4757     delete groupsMat;
4758     delete groupsPat;
4759
4760     delete flagMat;
4761     delete flagPat;
4762
4763     delete lineMat;
4764     delete linePat;
4765
4766     delete fieldPat;
4767     delete [] testData;
4768
4769     utext_close(&patternText);
4770     utext_close(&inputText);
4771
4772     delete [] patternChars;
4773     delete [] inputChars;
4774
4775
4776     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4777
4778 }
4779
4780
4781 //--------------------------------------------------------------
4782 //
4783 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4784 //             Use this pattern,
4785 //                 "(a?){1,8000000}"
4786 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4787 //                   This test is likely to be fragile, as further optimizations stop
4788 //                   more cases of pointless looping in the match engine.
4789 //
4790 //---------------------------------------------------------------
4791 void RegexTest::Bug6149() {
4792     UnicodeString pattern("(a?){1,8000000}");
4793     UnicodeString s("xyz");
4794     uint32_t flags = 0;
4795     UErrorCode status = U_ZERO_ERROR;
4796
4797     RegexMatcher  matcher(pattern, s, flags, status);
4798     UBool result = false;
4799     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4800     REGEX_ASSERT(result == FALSE);
4801  }
4802
4803
4804 //
4805 //   Callbacks()    Test the callback function.
4806 //                  When set, callbacks occur periodically during matching operations,
4807 //                  giving the application code the ability to abort the operation
//                  before its normal completion.
4809 //
4810
4811 struct callBackContext {
4812     RegexTest        *test;
4813     int32_t          maxCalls;
4814     int32_t          numCalls;
4815     int32_t          lastSteps;
4816     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4817 };
4818
4819 U_CDECL_BEGIN
4820 static UBool U_CALLCONV
4821 testCallBackFn(const void *context, int32_t steps) {
4822     callBackContext  *info = (callBackContext *)context;
4823     if (info->lastSteps+1 != steps) {
4824         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4825     }
4826     info->lastSteps = steps;
4827     info->numCalls++;
4828     return (info->numCalls < info->maxCalls);
4829 }
4830 U_CDECL_END
4831
// Exercises RegexMatcher::getMatchCallback() / setMatchCallback() and checks,
// through the counters in callBackContext, how often testCallBackFn() runs for
// short, medium and long running matches()/find() calls.
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        callBackContext cbInfo = {this, 0, 0, 0};
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        // The getter must hand back exactly the function and context just installed.
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   (The callback asks to stop on its 4th call; the status check below
        //    expects the match to finish before that happens.)
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   Same limit of 4 calls, longer input: the abort is expected to surface
        //   as U_REGEX_STOPPED_BY_CALLER after exactly 4 callback invocations.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);

        // A longer running find that the callback function will abort.
        //   Same input and limit as above, but driven through find() instead of matches().
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
4905
4906
4907 //
4908 //   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4912 //
4913
4914 struct progressCallBackContext {
4915     RegexTest        *test;
4916     int64_t          lastIndex;
4917     int32_t          maxCalls;
4918     int32_t          numCalls;
4919     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4920 };
4921
4922 // call-back function for find().
4923 // Return TRUE to continue the find().
4924 // Return FALSE to stop the find().
4925 U_CDECL_BEGIN
4926 static UBool U_CALLCONV
4927 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4928     progressCallBackContext  *info = (progressCallBackContext *)context;
4929     info->numCalls++;
4930     info->lastIndex = matchIndex;
4931 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4932     return (info->numCalls < info->maxCalls);
4933 }
4934 U_CDECL_END
4935
// Exercises RegexMatcher::getFindProgressCallback() / setFindProgressCallback()
// and checks, through the counters in progressCallBackContext, that
// testProgressCallBackFn() can observe and interrupt find() operations, and
// that an interrupted find can be resumed from the last reported index.
void RegexTest::FindProgressCallbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        // The getter must hand back exactly the function and context just installed.
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A find that matches on the initial position does NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "aaxxx";
        matcher.reset(s);
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running find() that causes matcher.find() to invoke our callback for each index,
        //   but not so many times that we interrupt the operation.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Call limit equal to the input length; the checks below expect the find to finish without reaching it
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        // Now retry the match from where left off
        //   (cbInfo.lastIndex holds the index passed to the last callback invocation;
        //    numCalls is deliberately not reset, only the limit is raised.)
        cbInfo.maxCalls = 100; //  Limit high enough that the callback is not expected to stop the find again
        status = U_ZERO_ERROR;
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
    }


}
5016
5017
5018 //---------------------------------------------------------------------------
5019 //
5020 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5021 //                             UTexts. The pure-C implementation of UText
5022 //                             has no mutable backing stores, but we can
5023 //                             use UnicodeString here to test the functionality.
5024 //
5025 //---------------------------------------------------------------------------
5026 void RegexTest::PreAllocatedUTextCAPI () {
5027     UErrorCode           status = U_ZERO_ERROR;
5028     URegularExpression  *re;
5029     UText                patternText = UTEXT_INITIALIZER;
5030     UnicodeString        buffer;
5031     UText                bufferText = UTEXT_INITIALIZER;
5032
5033     utext_openUnicodeString(&bufferText, &buffer, &status);
5034
5035     /*
5036      *  getText() and getUText()
5037      */
5038     {
5039         UText  text1 = UTEXT_INITIALIZER;
5040         UText  text2 = UTEXT_INITIALIZER;
5041         UChar  text2Chars[20];
5042         UText  *resultText;
5043
5044         status = U_ZERO_ERROR;
5045         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5046         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5047         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5048         utext_openUChars(&text2, text2Chars, -1, &status);
5049
5050         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5051         re = uregex_openUText(&patternText, 0, NULL, &status);
5052
5053         /* First set a UText */
5054         uregex_setUText(re, &text1, &status);
5055         resultText = uregex_getUText(re, &bufferText, &status);
5056         REGEX_CHECK_STATUS;
5057         REGEX_ASSERT(resultText == &bufferText);
5058         utext_setNativeIndex(resultText, 0);
5059         utext_setNativeIndex(&text1, 0);
5060         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5061
5062         resultText = uregex_getUText(re, &bufferText, &status);
5063         REGEX_CHECK_STATUS;
5064         REGEX_ASSERT(resultText == &bufferText);
5065         utext_setNativeIndex(resultText, 0);
5066         utext_setNativeIndex(&text1, 0);
5067         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5068
5069         /* Then set a UChar * */
5070         uregex_setText(re, text2Chars, 7, &status);
5071         resultText = uregex_getUText(re, &bufferText, &status);
5072         REGEX_CHECK_STATUS;
5073         REGEX_ASSERT(resultText == &bufferText);
5074         utext_setNativeIndex(resultText, 0);
5075         utext_setNativeIndex(&text2, 0);
5076         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5077
5078         uregex_close(re);
5079         utext_close(&text1);
5080         utext_close(&text2);
5081     }
5082
5083     /*
5084      *  group()
5085      */
5086     {
5087         UChar    text1[80];
5088         UText   *actual;
5089         UBool    result;
5090         int64_t  length = 0;
5091
5092         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5093         //                  012345678901234567890123456789012345678901234567
5094         //                  0         1         2         3         4
5095
5096         status = U_ZERO_ERROR;
5097         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5098         REGEX_CHECK_STATUS;
5099
5100         uregex_setText(re, text1, -1, &status);
5101         result = uregex_find(re, 0, &status);
5102         REGEX_ASSERT(result==TRUE);
5103
5104         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5105         status = U_ZERO_ERROR;
5106         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5107         REGEX_CHECK_STATUS;
5108         REGEX_ASSERT(actual == &bufferText);
5109         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5110         REGEX_ASSERT(length == 16);
5111         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5112
5113         /*  Capture group #1.  Should succeed, matching " interior ". */
5114         status = U_ZERO_ERROR;
5115         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5116         REGEX_CHECK_STATUS;
5117         REGEX_ASSERT(actual == &bufferText);
5118         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5119         REGEX_ASSERT(length == 10);
5120         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5121
5122         /*  Capture group out of range.  Error. */
5123         status = U_ZERO_ERROR;
5124         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5125         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5126         REGEX_ASSERT(actual == &bufferText);
5127         uregex_close(re);
5128
5129     }
5130
5131     /*
5132      *  replaceFirst()
5133      */
5134     {
5135         UChar    text1[80];
5136         UChar    text2[80];
5137         UText    replText = UTEXT_INITIALIZER;
5138         UText   *result;
5139         status = U_ZERO_ERROR;
5140         utext_openUnicodeString(&bufferText, &buffer, &status);
5141
5142         status = U_ZERO_ERROR;
5143         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5144         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5145         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5146
5147         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5148         REGEX_CHECK_STATUS;
5149
5150         /*  Normal case, with match */
5151         uregex_setText(re, text1, -1, &status);
5152         REGEX_CHECK_STATUS;
5153         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5154         REGEX_CHECK_STATUS;
5155         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5156         REGEX_CHECK_STATUS;
5157         REGEX_ASSERT(result == &bufferText);
5158         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5159
5160         /* No match.  Text should copy to output with no changes.  */
5161         uregex_setText(re, text2, -1, &status);
5162         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5163         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5164         REGEX_CHECK_STATUS;
5165         REGEX_ASSERT(result == &bufferText);
5166         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5167
5168         /* Unicode escapes */
5169         uregex_setText(re, text1, -1, &status);
5170         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5171         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5172         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5173         REGEX_CHECK_STATUS;
5174         REGEX_ASSERT(result == &bufferText);
5175         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5176
5177         uregex_close(re);
5178         utext_close(&replText);
5179     }
5180
5181
5182     /*
5183      *  replaceAll()
5184      */
5185     {
5186         UChar    text1[80];
5187         UChar    text2[80];
5188         UText    replText = UTEXT_INITIALIZER;
5189         UText   *result;
5190
5191         status = U_ZERO_ERROR;
5192         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5193         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5194         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5195
5196         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5197         REGEX_CHECK_STATUS;
5198
5199         /*  Normal case, with match */
5200         uregex_setText(re, text1, -1, &status);
5201         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5202         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5203         REGEX_CHECK_STATUS;
5204         REGEX_ASSERT(result == &bufferText);
5205         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5206
5207         /* No match.  Text should copy to output with no changes.  */
5208         uregex_setText(re, text2, -1, &status);
5209         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5210         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5211         REGEX_CHECK_STATUS;
5212         REGEX_ASSERT(result == &bufferText);
5213         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5214
5215         uregex_close(re);
5216         utext_close(&replText);
5217     }
5218
5219
5220     /*
5221      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5222      *   so we don't need to test it here.
5223      */
5224
5225     utext_close(&bufferText);
5226     utext_close(&patternText);
5227 }
5228
5229
5230 //--------------------------------------------------------------
5231 //
5232 //  NamedCapture   Check basic named capture group functionality
5233 //
5234 //--------------------------------------------------------------
5235 void RegexTest::NamedCapture() {
5236     UErrorCode status = U_ZERO_ERROR;
5237     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5238             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5239     REGEX_CHECK_STATUS;
5240     int32_t group = pat->groupNumberFromName("five", -1, status);
5241     REGEX_CHECK_STATUS;
5242     REGEX_ASSERT(5 == group);
5243     group = pat->groupNumberFromName("three", -1, status);
5244     REGEX_CHECK_STATUS;
5245     REGEX_ASSERT(3 == group);
5246
5247     status = U_ZERO_ERROR;
5248     group = pat->groupNumberFromName(UnicodeString("six"), status);
5249     REGEX_CHECK_STATUS;
5250     REGEX_ASSERT(6 == group);
5251
5252     status = U_ZERO_ERROR;
5253     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5254     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5255
5256     status = U_ZERO_ERROR;
5257
5258     // After copying a pattern, named capture should still work in the copy.
5259     RegexPattern *copiedPat = new RegexPattern(*pat);
5260     REGEX_ASSERT(*copiedPat == *pat);
5261     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5262
5263     group = copiedPat->groupNumberFromName("five", -1, status);
5264     REGEX_CHECK_STATUS;
5265     REGEX_ASSERT(5 == group);
5266     group = copiedPat->groupNumberFromName("three", -1, status);
5267     REGEX_CHECK_STATUS;
5268     REGEX_ASSERT(3 == group);
5269     delete copiedPat;
5270
5271     // ReplaceAll with named capture group.
5272     status = U_ZERO_ERROR;
5273     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5274     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5275     REGEX_CHECK_STATUS;
5276     // m.pattern().dumpPattern();
5277     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5278     REGEX_CHECK_STATUS;
5279     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5280     delete m;
5281
5282     // ReplaceAll, allowed capture group numbers.
5283     text = UnicodeString("abcmxyz");
5284     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5285     REGEX_CHECK_STATUS;
5286
5287     status = U_ZERO_ERROR;
5288     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5289     REGEX_CHECK_STATUS;
5290     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5291
5292     status = U_ZERO_ERROR;
5293     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5294     REGEX_CHECK_STATUS;
5295     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5296
5297     status = U_ZERO_ERROR;
5298     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5299     REGEX_CHECK_STATUS;
5300     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5301
5302     status = U_ZERO_ERROR;
5303     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5304     REGEX_CHECK_STATUS;
5305     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5306
5307     status = U_ZERO_ERROR;
5308     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5309     REGEX_CHECK_STATUS;
5310     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5311
5312     status = U_ZERO_ERROR;
5313     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5314     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5315
5316     status = U_ZERO_ERROR;
5317     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5318     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5319     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5320
5321     status = U_ZERO_ERROR;
5322     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5323     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5324     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5325
5326     status = U_ZERO_ERROR;
5327     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5328     REGEX_CHECK_STATUS;
5329     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5330
5331     status = U_ZERO_ERROR;
5332     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5333     REGEX_CHECK_STATUS;
5334     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5335
5336     status = U_ZERO_ERROR;
5337     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5338     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5339
5340     status = U_ZERO_ERROR;
5341     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5342     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5343
5344     status = U_ZERO_ERROR;
5345     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5346     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5347
5348     status = U_ZERO_ERROR;
5349     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5350     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5351
5352     delete m;
5353
5354     // Repeat the above replaceAll() tests using the plain C API, which
5355     //  has a separate implementation internally.
5356     //  TODO: factor out the test data.
5357
5358     status = U_ZERO_ERROR;
5359     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5360     REGEX_CHECK_STATUS;
5361     text = UnicodeString("abcmxyz");
5362     uregex_setText(re, text.getBuffer(), text.length(), &status);
5363     REGEX_CHECK_STATUS;
5364
5365     UChar resultBuf[100];
5366     int32_t resultLength;
5367     UnicodeString repl;
5368
5369     status = U_ZERO_ERROR;
5370     repl = UnicodeString("<$0>");
5371     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5372     REGEX_CHECK_STATUS;
5373     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5374
5375     status = U_ZERO_ERROR;
5376     repl = UnicodeString("<$1>");
5377     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5378     REGEX_CHECK_STATUS;
5379     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5380
5381     status = U_ZERO_ERROR;
5382     repl = UnicodeString("<${one}>");
5383     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5384     REGEX_CHECK_STATUS;
5385     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5386
5387     status = U_ZERO_ERROR;
5388     repl = UnicodeString("<$2>");
5389     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5390     REGEX_CHECK_STATUS;
5391     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5392
5393     status = U_ZERO_ERROR;
5394     repl = UnicodeString("<$3>");
5395     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5396     REGEX_CHECK_STATUS;
5397     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5398
5399     status = U_ZERO_ERROR;
5400     repl = UnicodeString("<$4>");
5401     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5402     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5403
5404     status = U_ZERO_ERROR;
5405     repl = UnicodeString("<$04>");
5406     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5407     REGEX_CHECK_STATUS;
5408     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5409
5410     status = U_ZERO_ERROR;
5411     repl = UnicodeString("<$000016>");
5412     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5413     REGEX_CHECK_STATUS;
5414     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5415
5416     status = U_ZERO_ERROR;
5417     repl = UnicodeString("<$3$2$1${one}>");
5418     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5419     REGEX_CHECK_STATUS;
5420     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5421
5422     status = U_ZERO_ERROR;
5423     repl = UnicodeString("$3$2$1${one}");
5424     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5425     REGEX_CHECK_STATUS;
5426     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5427
5428     status = U_ZERO_ERROR;
5429     repl = UnicodeString("<${noSuchName}>");
5430     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5431     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5432
5433     status = U_ZERO_ERROR;
5434     repl = UnicodeString("<${invalid-name}>");
5435     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5436     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5437
5438     status = U_ZERO_ERROR;
5439     repl = UnicodeString("<${one");
5440     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5441     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5442
5443     status = U_ZERO_ERROR;
5444     repl = UnicodeString("$not a capture group");
5445     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5446     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5447
5448     uregex_close(re);
5449 }
5450
5451 //--------------------------------------------------------------
5452 //
5453 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5454 //                       The point is not so much what the exact limit is,
5455 //                       but that a largish number doesn't hit bad non-linear performance,
5456 //                       and that exceeding the limit fails cleanly.
5457 //
5458 //--------------------------------------------------------------
5459 void RegexTest::NamedCaptureLimits() {
5460     if (quick) {
5461         logln("Skipping test. Runs in exhuastive mode only.");
5462         return;
5463     }
5464     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5465     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5466     char nnbuf[100];
5467     UnicodeString pattern;
5468     int32_t nn;
5469
5470     for (nn=1; nn<goodLimit; nn++) {
5471         sprintf(nnbuf, "(?<nn%d>)", nn);
5472         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5473     }
5474     UErrorCode status = U_ZERO_ERROR;
5475     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5476     REGEX_CHECK_STATUS;
5477     for (nn=1; nn<goodLimit; nn++) {
5478         sprintf(nnbuf, "nn%d", nn);
5479         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5480         REGEX_ASSERT(nn == groupNum);
5481         if (nn != groupNum) {
5482             break;
5483         }
5484     }
5485     delete pat;
5486
5487     pattern.remove();
5488     for (nn=1; nn<failLimit; nn++) {
5489         sprintf(nnbuf, "(?<nn%d>)", nn);
5490         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5491     }
5492     status = U_ZERO_ERROR;
5493     pat = RegexPattern::compile(pattern, 0, status);
5494     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5495     delete pat;
5496 }
5497
5498
5499 //--------------------------------------------------------------
5500 //
5501 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5502 //
5503 //---------------------------------------------------------------
5504 void RegexTest::Bug7651() {
5505     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5506     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5507     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5508     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5509     UnicodeString s("#ff @abcd This is test");
5510     RegexPattern  *REPattern = NULL;
5511     RegexMatcher  *REMatcher = NULL;
5512     UErrorCode status = U_ZERO_ERROR;
5513     UParseError pe;
5514
5515     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5516     REGEX_CHECK_STATUS;
5517     REMatcher = REPattern->matcher(s, status);
5518     REGEX_CHECK_STATUS;
5519     REGEX_ASSERT(REMatcher->find());
5520     REGEX_ASSERT(REMatcher->start(status) == 0);
5521     delete REPattern;
5522     delete REMatcher;
5523     status = U_ZERO_ERROR;
5524
5525     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5526     REGEX_CHECK_STATUS;
5527     REMatcher = REPattern->matcher(s, status);
5528     REGEX_CHECK_STATUS;
5529     REGEX_ASSERT(REMatcher->find());
5530     REGEX_ASSERT(REMatcher->start(status) == 0);
5531     delete REPattern;
5532     delete REMatcher;
5533     status = U_ZERO_ERROR;
5534  }
5535
5536 void RegexTest::Bug7740() {
5537     UErrorCode status = U_ZERO_ERROR;
5538     UnicodeString pattern = "(a)";
5539     UnicodeString text = "abcdef";
5540     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5541     REGEX_CHECK_STATUS;
5542     REGEX_ASSERT(m->lookingAt(status));
5543     REGEX_CHECK_STATUS;
5544     status = U_ILLEGAL_ARGUMENT_ERROR;
5545     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5546     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5547     REGEX_ASSERT(s == "");
5548     delete m;
5549 }
5550
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5552
5553 void RegexTest::Bug8479() {
5554     UErrorCode status = U_ZERO_ERROR;
5555
5556     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5557     REGEX_CHECK_STATUS;
5558     if (U_SUCCESS(status))
5559     {
5560         UnicodeString str;
5561         str.setToBogus();
5562         pMatcher->reset(str);
5563         status = U_ZERO_ERROR;
5564         pMatcher->matches(status);
5565         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5566         delete pMatcher;
5567     }
5568 }
5569
5570
5571 // Bug 7029
5572 void RegexTest::Bug7029() {
5573     UErrorCode status = U_ZERO_ERROR;
5574
5575     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5576     UnicodeString text = "abc.def";
5577     UnicodeString splits[10];
5578     REGEX_CHECK_STATUS;
5579     int32_t numFields = pMatcher->split(text, splits, 10, status);
5580     REGEX_CHECK_STATUS;
5581     REGEX_ASSERT(numFields == 8);
5582     delete pMatcher;
5583 }
5584
5585 // Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
5587 //   to a bmp character.
5588 //
5589 //   At the time of this writing there are none. If any should appear in a subsequent release
5590 //   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string  will need to be enhanced.
5592 //
5593 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5594 //   for details on what to do in case of a failure of this test.
5595 //
5596 void RegexTest::Bug9283() {
5597 #if !UCONFIG_NO_NORMALIZATION
5598     UErrorCode status = U_ZERO_ERROR;
5599     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5600     REGEX_CHECK_STATUS;
5601     int32_t index;
5602     UChar32 c;
5603     for (index=0; ; index++) {
5604         c = supplementalsWithCaseFolding.charAt(index);
5605         if (c == -1) {
5606             break;
5607         }
5608         UnicodeString cf = UnicodeString(c).foldCase();
5609         REGEX_ASSERT(cf.length() >= 2);
5610     }
5611 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5612 }
5613
5614
5615 void RegexTest::CheckInvBufSize() {
5616   if(inv_next>=INV_BUFSIZ) {
5617     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5618           __FILE__, INV_BUFSIZ, inv_next);
5619   } else {
5620     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5621   }
5622 }
5623
5624
5625 void RegexTest::Bug10459() {
5626     UErrorCode status = U_ZERO_ERROR;
5627     UnicodeString patternString("(txt)");
5628     UnicodeString txtString("txt");
5629
5630     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5631     REGEX_CHECK_STATUS;
5632     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5633     REGEX_CHECK_STATUS;
5634
5635     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5636     REGEX_CHECK_STATUS;
5637
5638     uregex_setUText(icu_re, utext_txt, &status);
5639     REGEX_CHECK_STATUS;
5640
5641     // The bug was that calling uregex_group() before doing a matching operation
5642     //   was causing a segfault. Only for Regular Expressions created from UText.
5643     //   It should set an U_REGEX_INVALID_STATE.
5644
5645     UChar buf[100];
5646     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5647     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5648     REGEX_ASSERT(len == 0);
5649
5650     uregex_close(icu_re);
5651     utext_close(utext_pat);
5652     utext_close(utext_txt);
5653 }
5654
5655 void RegexTest::TestCaseInsensitiveStarters() {
5656     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5657     //  become stale because of new Unicode characters.
5658     // If it is stale, rerun the generation tool
5659     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5660     // and replace the embedded data in i18n/regexcmp.cpp
5661
5662     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5663         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5664             continue;
5665         }
5666         UnicodeSet s(cp, cp);
5667         s.closeOver(USET_CASE_INSENSITIVE);
5668         UnicodeSetIterator setIter(s);
5669         while (setIter.next()) {
5670             if (!setIter.isString()) {
5671                 continue;
5672             }
5673             const UnicodeString &str = setIter.getString();
5674             UChar32 firstChar = str.char32At(0);
5675             UnicodeSet starters;
5676             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5677             if (!starters.contains(cp)) {
5678                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5679                 return;
5680             }
5681         }
5682     }
5683 }
5684
5685
5686 void RegexTest::TestBug11049() {
5687     // Original bug report: pattern with match start consisting of one of several individual characters,
5688     //  and the text being matched ending with a supplementary character. find() would read past the
5689     //  end of the input text when searching for potential match starting points.
5690
5691     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5692     // detect the bad read.
5693
5694     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5695     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5696
5697     // Test again with a pattern starting with a single character,
5698     // which takes a different code path than starting with an OR expression,
5699     // but with similar logic.
5700     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5701     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5702 }
5703
5704 // Run a single test case from TestBug11049(). Internal function.
5705 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5706     UErrorCode status = U_ZERO_ERROR;
5707     UnicodeString patternString = UnicodeString(pattern).unescape();
5708     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5709
5710     UnicodeString dataString = UnicodeString(data).unescape();
5711     UChar *exactBuffer = new UChar[dataString.length()];
5712     dataString.extract(exactBuffer, dataString.length(), status);
5713     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5714
5715     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5716     REGEX_CHECK_STATUS;
5717     matcher->reset(ut);
5718     UBool result = matcher->find();
5719     if (result != expectMatch) {
5720         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5721               __FILE__, lineNumber, expectMatch, result, pattern, data);
5722     }
5723
5724     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5725     //   off-by-one on find() with match at the last code point.
5726     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5727     //   because string.unescape() will only shrink it.
5728     char * utf8Buffer = new char[uprv_strlen(data)+1];
5729     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5730     REGEX_CHECK_STATUS;
5731     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5732     REGEX_CHECK_STATUS;
5733     matcher->reset(ut);
5734     result = matcher->find();
5735     if (result != expectMatch) {
5736         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5737               __FILE__, lineNumber, expectMatch, result, pattern, data);
5738     }
5739     delete [] utf8Buffer;
5740
5741     utext_close(ut);
5742     delete [] exactBuffer;
5743 }
5744
5745
5746 void RegexTest::TestBug11371() {
5747     if (quick) {
5748         logln("Skipping test. Runs in exhuastive mode only.");
5749         return;
5750     }
5751     UErrorCode status = U_ZERO_ERROR;
5752     UnicodeString patternString;
5753
5754     for (int i=0; i<8000000; i++) {
5755         patternString.append(UnicodeString("()"));
5756     }
5757     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5758     if (status != U_REGEX_PATTERN_TOO_BIG) {
5759         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5760               __FILE__, __LINE__, u_errorName(status));
5761     }
5762
5763     status = U_ZERO_ERROR;
5764     patternString = "(";
5765     for (int i=0; i<20000000; i++) {
5766         patternString.append(UnicodeString("A++"));
5767     }
5768     patternString.append(UnicodeString("){0}B++"));
5769     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5770     if (status != U_REGEX_PATTERN_TOO_BIG) {
5771         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5772               __FILE__, __LINE__, u_errorName(status));
5773     }
5774
5775     // Pattern with too much string data, such that string indexes overflow operand data field size
5776     // in compiled instruction.
5777     status = U_ZERO_ERROR;
5778     patternString = "";
5779     while (patternString.length() < 0x00ffffff) {
5780         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5781     }
5782     patternString.append(UnicodeString("X? trailing string"));
5783     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5784     if (status != U_REGEX_PATTERN_TOO_BIG) {
5785         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5786               __FILE__, __LINE__, u_errorName(status));
5787     }
5788 }
5789
5790 void RegexTest::TestBug11480() {
5791     // C API, get capture group of a group that does not participate in the match.
5792     //        (Returns a zero length string, with nul termination,
5793     //         indistinguishable from a group with a zero length match.)
5794
5795     UErrorCode status = U_ZERO_ERROR;
5796     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5797     REGEX_CHECK_STATUS;
5798     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5799     uregex_setText(re, text.getBuffer(), text.length(), &status);
5800     REGEX_CHECK_STATUS;
5801     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5802     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5803     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5804     REGEX_ASSERT(length == 0);
5805     REGEX_ASSERT(buf[0] == 13);
5806     REGEX_ASSERT(buf[1] == 0);
5807     REGEX_ASSERT(buf[2] == 13);
5808     uregex_close(re);
5809
5810     // UText C++ API, length of match is 0 for non-participating matches.
5811     UText ut = UTEXT_INITIALIZER;
5812     utext_openUnicodeString(&ut, &text, &status);
5813     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5814     REGEX_CHECK_STATUS;
5815     matcher.reset(&ut);
5816     REGEX_ASSERT(matcher.lookingAt(0, status));
5817
5818     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5819     int64_t groupLen = -666;
5820     UText group = UTEXT_INITIALIZER;
5821     matcher.group(1, &group, groupLen, status);
5822     REGEX_CHECK_STATUS;
5823     REGEX_ASSERT(groupLen == 1);
5824     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5825
5826     // Capture group 2, the (B), does not participate in the match.
5827     matcher.group(2, &group, groupLen, status);
5828     REGEX_CHECK_STATUS;
5829     REGEX_ASSERT(groupLen == 0);
5830     REGEX_ASSERT(matcher.start(2, status) == -1);
5831     REGEX_CHECK_STATUS;
5832 }
5833
5834
5835 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */