icuSources/test/intltest/regextst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2012, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 //
   8 //   regextst.cpp
   9 //
  10 //      ICU Regular Expressions test, part of intltest.
  11 //
  12
  13 /*
  14      NOTE!!
  15
  16      PLEASE be careful about ASCII assumptions in this test.
  17      This test is one of the worst repeat offenders.
  18      If you have questions, contact someone on the ICU PMC
  19      who has access to an EBCDIC system.
  20
  21  */
  22
  23 #include "intltest.h"
  24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  25
  26 #include "unicode/regex.h"
  27 #include "unicode/uchar.h"
  28 #include "unicode/ucnv.h"
  29 #include "unicode/uniset.h"
  30 #include "unicode/ustring.h"
  31 #include "regextst.h"
  32 #include "uvector.h"
  33 #include "util.h"
  34 #include <stdlib.h>
  35 #include <string.h>
  36 #include <stdio.h>
  37 #include "cstring.h"
  38 #include "uinvchar.h"
  39
  40 #define SUPPORT_MUTATING_INPUT_STRING   0
  41
  42 //---------------------------------------------------------------------------
  43 //
  44 //  Test class boilerplate
  45 //
  46 //---------------------------------------------------------------------------
  47 RegexTest::RegexTest()
  48 {
  49 }
  50
  51
  52 RegexTest::~RegexTest()
  53 {
  54 }
  55
  56
  57
  58 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  59 {
  60     if (exec) logln("TestSuite RegexTest: ");
  61     switch (index) {
  62
  63         case 0: name = "Basic";
  64             if (exec) Basic();
  65             break;
  66         case 1: name = "API_Match";
  67             if (exec) API_Match();
  68             break;
  69         case 2: name = "API_Replace";
  70             if (exec) API_Replace();
  71             break;
  72         case 3: name = "API_Pattern";
  73             if (exec) API_Pattern();
  74             break;
  75         case 4:
  76 #if !UCONFIG_NO_FILE_IO
  77             name = "Extended";
  78             if (exec) Extended();
  79 #else
  80             name = "skip";
  81 #endif
  82             break;
  83         case 5: name = "Errors";
  84             if (exec) Errors();
  85             break;
  86         case 6: name = "PerlTests";
  87             if (exec) PerlTests();
  88             break;
  89         case 7: name = "Callbacks";
  90             if (exec) Callbacks();
  91             break;
  92         case 8: name = "FindProgressCallbacks";
  93             if (exec) FindProgressCallbacks();
  94             break;
  95         case 9: name = "Bug 6149";
  96              if (exec) Bug6149();
  97              break;
  98         case 10: name = "UTextBasic";
  99           if (exec) UTextBasic();
 100           break;
 101         case 11: name = "API_Match_UTF8";
 102           if (exec) API_Match_UTF8();
 103           break;
 104         case 12: name = "API_Replace_UTF8";
 105           if (exec) API_Replace_UTF8();
 106           break;
 107         case 13: name = "API_Pattern_UTF8";
 108           if (exec) API_Pattern_UTF8();
 109           break;
 110         case 14: name = "PerlTestsUTF8";
 111           if (exec) PerlTestsUTF8();
 112           break;
 113         case 15: name = "PreAllocatedUTextCAPI";
 114           if (exec) PreAllocatedUTextCAPI();
 115           break;
 116         case 16: name = "Bug 7651";
 117              if (exec) Bug7651();
 118              break;
 119         case 17: name = "Bug 7740";
 120             if (exec) Bug7740();
 121             break;
 122         case 18: name = "Bug 8479";
 123             if (exec) Bug8479();
 124             break;
 125         case 19: name = "Bug 7029";
 126             if (exec) Bug7029();
 127             break;
 128         case 20: name = "CheckInvBufSize";
 129             if (exec) CheckInvBufSize();
 130             break;
 131         case 21: name = "Bug 9283";
 132             if (exec) Bug9283();
 133             break;
 134
 135         default: name = "";
 136             break; //needed to end loop
 137     }
 138 }
 139
 140
 141
 142 /**
 143  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
 144  * into ASCII.
 145  * @see utext_openUTF8
 146  */
 147 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
 148
 149 //---------------------------------------------------------------------------
 150 //
 151 //   Error Checking / Reporting macros used in all of the tests.
 152 //
 153 //---------------------------------------------------------------------------
 154
 155 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
 156   int64_t oldIndex = utext_getNativeIndex(text);
 157   utext_setNativeIndex(text, 0);
 158   char *bufPtr = buf;
 159   UChar32 c = utext_next32From(text, 0);
 160   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
 161     if (0x000020<=c && c<0x00007e) {
 162       *bufPtr = c;
 163     } else {
 164 #if 0
 165       sprintf(bufPtr,"U+%04X", c);
 166       bufPtr+= strlen(bufPtr)-1;
 167 #else
 168       *bufPtr = '%';
 169 #endif
 170     }
 171     bufPtr++;
 172     c = UTEXT_NEXT32(text);
 173   }
 174   *bufPtr = 0;
 175 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
 176   char *ebuf = (char*)malloc(bufLen);
 177   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
 178   uprv_strncpy(buf, ebuf, bufLen);
 179   free((void*)ebuf);
 180 #endif
 181   utext_setNativeIndex(text, oldIndex);
 182 }
 183
 184
 185 static char ASSERT_BUF[1024];
 186
 187 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
 188   if(message.length()==0) {
 189     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
 190   } else {
 191     UnicodeString buf;
 192     IntlTest::prettify(message,buf);
 193     if(buf.length()==0) {
 194       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
 195     } else {
 196       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
 197       if(ASSERT_BUF[0]==0) {
 198         ASSERT_BUF[0]=0;
 199         for(int32_t i=0;i<buf.length();i++) {
 200           UChar ch = buf[i];
 201           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
 202         }
 203       }
 204     }
 205   }
 206   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
 207   return ASSERT_BUF;
 208 }
 209
 210
 211 #define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,sizeof(buf)/sizeof(buf[0]),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}
 212
 213 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
 214                                                               __FILE__, __LINE__, u_errorName(status)); return;}}
 215
 216 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}
 217
 218 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
 219 if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
 220     __LINE__, u_errorName(errcode), u_errorName(status));};}
 221
 222 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
 223     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
 224
 225 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
 226     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
 227
 228 #define REGEX_ASSERT_UNISTR(ustr,inv) {if (!(ustr==inv)) {errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s,%s) failed \n", __FILE__, __LINE__, extractToAssertBuf(ustr),inv);};}
 229
 230
 231 static UBool testUTextEqual(UText *uta, UText *utb) {
 232     UChar32 ca = 0;
 233     UChar32 cb = 0;
 234     utext_setNativeIndex(uta, 0);
 235     utext_setNativeIndex(utb, 0);
 236     do {
 237         ca = utext_next32(uta);
 238         cb = utext_next32(utb);
 239         if (ca != cb) {
 240             break;
 241         }
 242     } while (ca != U_SENTINEL);
 243     return ca == cb;
 244 }
 245
 246
 247 /**
 248  * @param expected expected text in UTF-8 (not platform) codepage
 249  */
 250 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
 251     UErrorCode status = U_ZERO_ERROR;
 252     UText expectedText = UTEXT_INITIALIZER;
 253     utext_openUTF8(&expectedText, expected, -1, &status);
 254     if(U_FAILURE(status)) {
 255       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 256       return;
 257     }
 258     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
 259       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
 260       return;
 261     }
 262     utext_setNativeIndex(actual, 0);
 263     if (!testUTextEqual(&expectedText, actual)) {
 264         char buf[201 /*21*/];
 265         char expectedBuf[201];
 266         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 267         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 268         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 269     }
 270     utext_close(&expectedText);
 271 }
 272 /**
 273  * @param expected invariant (platform local text) input
 274  */
 275
 276 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
 277     UErrorCode status = U_ZERO_ERROR;
 278     UText expectedText = UTEXT_INITIALIZER;
 279     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
 280     if(U_FAILURE(status)) {
 281       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
 282       return;
 283     }
 284     utext_setNativeIndex(actual, 0);
 285     if (!testUTextEqual(&expectedText, actual)) {
 286         char buf[201 /*21*/];
 287         char expectedBuf[201];
 288         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
 289         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
 290         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
 291     }
 292     utext_close(&expectedText);
 293 }
 294
 295 /**
 296  * Assumes utf-8 input
 297  */
 298 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
 299 /**
 300  * Assumes Invariant input
 301  */
 302 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
 303
 304 /**
 305  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 306  * passed into utext_openUTF8. An error will be given if
 307  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
 308  */
 309
 310 #define INV_BUFSIZ 2048 /* increase this if too small */
 311
 312 static int64_t inv_next=0;
 313
 314 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
 315 static char inv_buf[INV_BUFSIZ];
 316 #endif
 317
 318 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
 319   if(length==-1) length=strlen(inv);
 320 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
 321   inv_next+=length;
 322   return utext_openUTF8(ut, inv, length, status);
 323 #else
 324   if(inv_next+length+1>INV_BUFSIZ) {
 325     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
 326             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
 327     *status = U_MEMORY_ALLOCATION_ERROR;
 328     return NULL;
 329   }
 330
 331   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
 332   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
 333   inv_next+=length;
 334
 335 #if 0
 336   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
 337 #endif
 338
 339   return utext_openUTF8(ut, (const char*)buf, length, status);
 340 #endif
 341 }
 342
 343
 344 //---------------------------------------------------------------------------
 345 //
 346 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 347 //                       for the LookingAt() and  Match() functions.
 348 //
 349 //       usage:
 350 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 351 //
 352 //          The expected results are UBool - TRUE or FALSE.
 353 //          The input text is unescaped.  The pattern is not.
 354 //
 355 //
 356 //---------------------------------------------------------------------------
 357
 358 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
 359
 360 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 361     const UnicodeString pattern(pat, -1, US_INV);
 362     const UnicodeString inputText(text, -1, US_INV);
 363     UErrorCode          status  = U_ZERO_ERROR;
 364     UParseError         pe;
 365     RegexPattern        *REPattern = NULL;
 366     RegexMatcher        *REMatcher = NULL;
 367     UBool               retVal     = TRUE;
 368
 369     UnicodeString patString(pat, -1, US_INV);
 370     REPattern = RegexPattern::compile(patString, 0, pe, status);
 371     if (U_FAILURE(status)) {
 372         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
 373             line, u_errorName(status));
 374         return FALSE;
 375     }
 376     if (line==376) { RegexPatternDump(REPattern);}
 377
 378     UnicodeString inputString(inputText);
 379     UnicodeString unEscapedInput = inputString.unescape();
 380     REMatcher = REPattern->matcher(unEscapedInput, status);
 381     if (U_FAILURE(status)) {
 382         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 383             line, u_errorName(status));
 384         return FALSE;
 385     }
 386
 387     UBool actualmatch;
 388     actualmatch = REMatcher->lookingAt(status);
 389     if (U_FAILURE(status)) {
 390         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 391             line, u_errorName(status));
 392         retVal =  FALSE;
 393     }
 394     if (actualmatch != looking) {
 395         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 396         retVal = FALSE;
 397     }
 398
 399     status = U_ZERO_ERROR;
 400     actualmatch = REMatcher->matches(status);
 401     if (U_FAILURE(status)) {
 402         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 403             line, u_errorName(status));
 404         retVal = FALSE;
 405     }
 406     if (actualmatch != match) {
 407         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 408         retVal = FALSE;
 409     }
 410
 411     if (retVal == FALSE) {
 412         RegexPatternDump(REPattern);
 413     }
 414
 415     delete REPattern;
 416     delete REMatcher;
 417     return retVal;
 418 }
 419
 420
 421 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 422     UText               pattern    = UTEXT_INITIALIZER;
 423     int32_t             inputUTF8Length;
 424     char                *textChars = NULL;
 425     UText               inputText  = UTEXT_INITIALIZER;
 426     UErrorCode          status     = U_ZERO_ERROR;
 427     UParseError         pe;
 428     RegexPattern        *REPattern = NULL;
 429     RegexMatcher        *REMatcher = NULL;
 430     UBool               retVal     = TRUE;
 431
 432     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
 433     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
 434     if (U_FAILURE(status)) {
 435         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
 436             line, u_errorName(status));
 437         return FALSE;
 438     }
 439
 440     UnicodeString inputString(text, -1, US_INV);
 441     UnicodeString unEscapedInput = inputString.unescape();
 442     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
 443     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
 444
 445     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
 446     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 447         // UTF-8 does not allow unpaired surrogates, so this could actually happen
 448         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
 449         return TRUE; // not a failure of the Regex engine
 450     }
 451     status = U_ZERO_ERROR; // buffer overflow
 452     textChars = new char[inputUTF8Length+1];
 453     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
 454     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
 455
 456     REMatcher = &REPattern->matcher(status)->reset(&inputText);
 457     if (U_FAILURE(status)) {
 458         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
 459             line, u_errorName(status));
 460         return FALSE;
 461     }
 462
 463     UBool actualmatch;
 464     actualmatch = REMatcher->lookingAt(status);
 465     if (U_FAILURE(status)) {
 466         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
 467             line, u_errorName(status));
 468         retVal =  FALSE;
 469     }
 470     if (actualmatch != looking) {
 471         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
 472         retVal = FALSE;
 473     }
 474
 475     status = U_ZERO_ERROR;
 476     actualmatch = REMatcher->matches(status);
 477     if (U_FAILURE(status)) {
 478         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
 479             line, u_errorName(status));
 480         retVal = FALSE;
 481     }
 482     if (actualmatch != match) {
 483         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
 484         retVal = FALSE;
 485     }
 486
 487     if (retVal == FALSE) {
 488         RegexPatternDump(REPattern);
 489     }
 490
 491     delete REPattern;
 492     delete REMatcher;
 493     utext_close(&inputText);
 494     utext_close(&pattern);
 495     delete[] textChars;
 496     return retVal;
 497 }
 498
 499
 500
 501 //---------------------------------------------------------------------------
 502 //
 503 //    REGEX_ERR       Macro + invocation function to simplify writing tests
 504 //                       regex tests for incorrect patterns
 505 //
 506 //       usage:
 507 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 508 //
 509 //---------------------------------------------------------------------------
 510 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 511
 512 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 513                           UErrorCode expectedStatus, int32_t line) {
 514     UnicodeString       pattern(pat);
 515
 516     UErrorCode          status         = U_ZERO_ERROR;
 517     UParseError         pe;
 518     RegexPattern        *callerPattern = NULL;
 519
 520     //
 521     //  Compile the caller's pattern
 522     //
 523     UnicodeString patString(pat);
 524     callerPattern = RegexPattern::compile(patString, 0, pe, status);
 525     if (status != expectedStatus) {
 526         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 527     } else {
 528         if (status != U_ZERO_ERROR) {
 529             if (pe.line != errLine || pe.offset != errCol) {
 530                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 531                     line, errLine, errCol, pe.line, pe.offset);
 532             }
 533         }
 534     }
 535
 536     delete callerPattern;
 537
 538     //
 539     //  Compile again, using a UTF-8-based UText
 540     //
 541     UText patternText = UTEXT_INITIALIZER;
 542     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
 543     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
 544     if (status != expectedStatus) {
 545         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 546     } else {
 547         if (status != U_ZERO_ERROR) {
 548             if (pe.line != errLine || pe.offset != errCol) {
 549                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 550                     line, errLine, errCol, pe.line, pe.offset);
 551             }
 552         }
 553     }
 554
 555     delete callerPattern;
 556     utext_close(&patternText);
 557 }
 558
 559
 560
 561 //---------------------------------------------------------------------------
 562 //
 563 //      Basic      Check for basic functionality of regex pattern matching.
 564 //                 Avoid the use of REGEX_FIND test macro, which has
 565 //                 substantial dependencies on basic Regex functionality.
 566 //
 567 //---------------------------------------------------------------------------
 568 void RegexTest::Basic() {
 569
 570
 571 //
 572 // Debug - slide failing test cases early
 573 //
 574 #if 0
 575     {
 576         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
 577         UParseError pe;
 578         UErrorCode  status = U_ZERO_ERROR;
 579         RegexPattern *pattern;
 580         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
 581         RegexPatternDump(pattern);
 582         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
 583         UBool result = m->find();
 584         printf("result = %d\n", result);
 585         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
 586         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
 587     }
 588     exit(1);
 589 #endif
 590
 591
 592     //
 593     // Pattern with parentheses
 594     //
 595     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
 596     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
 597     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
 598
 599     //
 600     // Patterns with *
 601     //
 602     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
 603     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
 604     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
 605     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
 606     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
 607
 608     REGEX_TESTLM("a*", "",  TRUE, TRUE);
 609     REGEX_TESTLM("a*", "b", TRUE, FALSE);
 610
 611
 612     //
 613     //  Patterns with "."
 614     //
 615     REGEX_TESTLM(".", "abc", TRUE, FALSE);
 616     REGEX_TESTLM("...", "abc", TRUE, TRUE);
 617     REGEX_TESTLM("....", "abc", FALSE, FALSE);
 618     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
 619     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
 620     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
 621     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
 622     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
 623
 624     //
 625     //  Patterns with * applied to chars at end of literal string
 626     //
 627     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
 628     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
 629
 630     //
 631     //  Supplemental chars match as single chars, not a pair of surrogates.
 632     //
 633     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
 634     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
 635     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
 636
 637
 638     //
 639     //  UnicodeSets in the pattern
 640     //
 641     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
 642     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
 643     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
 644     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 645     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 646     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
 647
 648     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
 649     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
 650     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
 651     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
 652     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
 653
 654     //
 655     //   OR operator in patterns
 656     //
 657     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
 658     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
 659     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
 660     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
 661
 662     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
 663     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
 664     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
 665     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
 666     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
 667     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
 668
 669     //
 670     //  +
 671     //
 672     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
 673     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
 674     REGEX_TESTLM("b+", "", FALSE, FALSE);
 675     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
 676     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
 677     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
 678
 679     //
 680     //   ?
 681     //
 682     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
 683     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
 684     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
 685     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
 686     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
 687     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
 688     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
 689     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
 690     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
 691
 692     //
 693     //  Escape sequences that become single literal chars, handled internally
 694     //   by ICU's Unescape.
 695     //
 696
 697     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
 698     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
 699     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
 700     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
 701     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
 702     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
 703     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
 704     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
 705     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
 706     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
 707
 708     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
 709     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
 710
 711     // Escape of special chars in patterns
 712     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
 713 }
 714
 715
 716 //---------------------------------------------------------------------------
 717 //
 718 //    UTextBasic   Check for quirks that are specific to the UText
 719 //                 implementation.
 720 //
 721 //---------------------------------------------------------------------------
 722 void RegexTest::UTextBasic() {
 723     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
 724     UErrorCode status = U_ZERO_ERROR;
 725     UText pattern = UTEXT_INITIALIZER;
 726     utext_openUTF8(&pattern, str_abc, -1, &status);
 727     RegexMatcher matcher(&pattern, 0, status);
 728     REGEX_CHECK_STATUS;
 729
 730     UText input = UTEXT_INITIALIZER;
 731     utext_openUTF8(&input, str_abc, -1, &status);
 732     REGEX_CHECK_STATUS;
 733     matcher.reset(&input);
 734     REGEX_CHECK_STATUS;
 735     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 736
 737     matcher.reset(matcher.inputText());
 738     REGEX_CHECK_STATUS;
 739     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
 740
 741     utext_close(&pattern);
 742     utext_close(&input);
 743 }
 744
 745
 746 //---------------------------------------------------------------------------
 747 //
 748 //      API_Match   Test that the API for class RegexMatcher
 749 //                  is present and nominally working, but excluding functions
 750 //                  implementing replace operations.
 751 //
 752 //---------------------------------------------------------------------------
 753 void RegexTest::API_Match() {
 754     UParseError         pe;
 755     UErrorCode          status=U_ZERO_ERROR;
 756     int32_t             flags = 0;
 757
 758     //
 759     // Debug - slide failing test cases early
 760     //
 761 #if 0
 762     {
 763     }
 764     return;
 765 #endif
 766
 767     //
 768     // Simple pattern compilation
 769     //
 770     {
 771         UnicodeString       re("abc");
 772         RegexPattern        *pat2;
 773         pat2 = RegexPattern::compile(re, flags, pe, status);
 774         REGEX_CHECK_STATUS;
 775
 776         UnicodeString inStr1 = "abcdef this is a test";
 777         UnicodeString instr2 = "not abc";
 778         UnicodeString empty  = "";
 779
 780
 781         //
 782         // Matcher creation and reset.
 783         //
 784         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 785         REGEX_CHECK_STATUS;
 786         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 787         REGEX_ASSERT(m1->input() == inStr1);
 788         m1->reset(instr2);
 789         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 790         REGEX_ASSERT(m1->input() == instr2);
 791         m1->reset(inStr1);
 792         REGEX_ASSERT(m1->input() == inStr1);
 793         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 794         m1->reset(empty);
 795         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 796         REGEX_ASSERT(m1->input() == empty);
 797         REGEX_ASSERT(&m1->pattern() == pat2);
 798
 799         //
 800         //  reset(pos, status)
 801         //
 802         m1->reset(inStr1);
 803         m1->reset(4, status);
 804         REGEX_CHECK_STATUS;
 805         REGEX_ASSERT(m1->input() == inStr1);
 806         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 807
 808         m1->reset(-1, status);
 809         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 810         status = U_ZERO_ERROR;
 811
 812         m1->reset(0, status);
 813         REGEX_CHECK_STATUS;
 814         status = U_ZERO_ERROR;
 815
 816         int32_t len = m1->input().length();
 817         m1->reset(len-1, status);
 818         REGEX_CHECK_STATUS;
 819         status = U_ZERO_ERROR;
 820
 821         m1->reset(len, status);
 822         REGEX_CHECK_STATUS;
 823         status = U_ZERO_ERROR;
 824
 825         m1->reset(len+1, status);
 826         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 827         status = U_ZERO_ERROR;
 828
 829         //
 830         // match(pos, status)
 831         //
 832         m1->reset(instr2);
 833         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 834         m1->reset();
 835         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 836         m1->reset();
 837         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 838         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 839         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 840         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 841
 842         // Match() at end of string should fail, but should not
 843         //  be an error.
 844         status = U_ZERO_ERROR;
 845         len = m1->input().length();
 846         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 847         REGEX_CHECK_STATUS;
 848
 849         // Match beyond end of string should fail with an error.
 850         status = U_ZERO_ERROR;
 851         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 852         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 853
 854         // Successful match at end of string.
 855         {
 856             status = U_ZERO_ERROR;
 857             RegexMatcher m("A?", 0, status);  // will match zero length string.
 858             REGEX_CHECK_STATUS;
 859             m.reset(inStr1);
 860             len = inStr1.length();
 861             REGEX_ASSERT(m.matches(len, status) == TRUE);
 862             REGEX_CHECK_STATUS;
 863             m.reset(empty);
 864             REGEX_ASSERT(m.matches(0, status) == TRUE);
 865             REGEX_CHECK_STATUS;
 866         }
 867
 868
 869         //
 870         // lookingAt(pos, status)
 871         //
 872         status = U_ZERO_ERROR;
 873         m1->reset(instr2);  // "not abc"
 874         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 875         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 876         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 877         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 878         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 879         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 880         status = U_ZERO_ERROR;
 881         len = m1->input().length();
 882         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 883         REGEX_CHECK_STATUS;
 884         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 885         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 886
 887         delete m1;
 888         delete pat2;
 889     }
 890
 891
 892     //
 893     // Capture Group.
 894     //     RegexMatcher::start();
 895     //     RegexMatcher::end();
 896     //     RegexMatcher::groupCount();
 897     //
 898     {
 899         int32_t             flags=0;
 900         UParseError         pe;
 901         UErrorCode          status=U_ZERO_ERROR;
 902
 903         UnicodeString       re("01(23(45)67)(.*)");
 904         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 905         REGEX_CHECK_STATUS;
 906         UnicodeString data = "0123456789";
 907
 908         RegexMatcher *matcher = pat->matcher(data, status);
 909         REGEX_CHECK_STATUS;
 910         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 911         static const int32_t matchStarts[] = {0,  2, 4, 8};
 912         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 913         int32_t i;
 914         for (i=0; i<4; i++) {
 915             int32_t actualStart = matcher->start(i, status);
 916             REGEX_CHECK_STATUS;
 917             if (actualStart != matchStarts[i]) {
 918                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 919                     __LINE__, i, matchStarts[i], actualStart);
 920             }
 921             int32_t actualEnd = matcher->end(i, status);
 922             REGEX_CHECK_STATUS;
 923             if (actualEnd != matchEnds[i]) {
 924                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 925                     __LINE__, i, matchEnds[i], actualEnd);
 926             }
 927         }
 928
 929         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 930         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 931
 932         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 933         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 934         matcher->reset();
 935         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 936
 937         matcher->lookingAt(status);
 938         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 939         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 940         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 941         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 942         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 943         REGEX_CHECK_STATUS;
 944         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 945         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 946         matcher->reset();
 947         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 948
 949         delete matcher;
 950         delete pat;
 951
 952     }
 953
 954     //
 955     //  find
 956     //
 957     {
 958         int32_t             flags=0;
 959         UParseError         pe;
 960         UErrorCode          status=U_ZERO_ERROR;
 961
 962         UnicodeString       re("abc");
 963         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 964         REGEX_CHECK_STATUS;
 965         UnicodeString data = ".abc..abc...abc..";
 966         //                    012345678901234567
 967
 968         RegexMatcher *matcher = pat->matcher(data, status);
 969         REGEX_CHECK_STATUS;
 970         REGEX_ASSERT(matcher->find());
 971         REGEX_ASSERT(matcher->start(status) == 1);
 972         REGEX_ASSERT(matcher->find());
 973         REGEX_ASSERT(matcher->start(status) == 6);
 974         REGEX_ASSERT(matcher->find());
 975         REGEX_ASSERT(matcher->start(status) == 12);
 976         REGEX_ASSERT(matcher->find() == FALSE);
 977         REGEX_ASSERT(matcher->find() == FALSE);
 978
 979         matcher->reset();
 980         REGEX_ASSERT(matcher->find());
 981         REGEX_ASSERT(matcher->start(status) == 1);
 982
 983         REGEX_ASSERT(matcher->find(0, status));
 984         REGEX_ASSERT(matcher->start(status) == 1);
 985         REGEX_ASSERT(matcher->find(1, status));
 986         REGEX_ASSERT(matcher->start(status) == 1);
 987         REGEX_ASSERT(matcher->find(2, status));
 988         REGEX_ASSERT(matcher->start(status) == 6);
 989         REGEX_ASSERT(matcher->find(12, status));
 990         REGEX_ASSERT(matcher->start(status) == 12);
 991         REGEX_ASSERT(matcher->find(13, status) == FALSE);
 992         REGEX_ASSERT(matcher->find(16, status) == FALSE);
 993         REGEX_ASSERT(matcher->find(17, status) == FALSE);
 994         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
 995
 996         status = U_ZERO_ERROR;
 997         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 998         status = U_ZERO_ERROR;
 999         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1000
1001         REGEX_ASSERT(matcher->groupCount() == 0);
1002
1003         delete matcher;
1004         delete pat;
1005     }
1006
1007
1008     //
1009     //  find, with \G in pattern (true if at the end of a previous match).
1010     //
1011     {
1012         int32_t             flags=0;
1013         UParseError         pe;
1014         UErrorCode          status=U_ZERO_ERROR;
1015
1016         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1017         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1018         REGEX_CHECK_STATUS;
1019         UnicodeString data = ".abcabc.abc..";
1020         //                    012345678901234567
1021
1022         RegexMatcher *matcher = pat->matcher(data, status);
1023         REGEX_CHECK_STATUS;
1024         REGEX_ASSERT(matcher->find());
1025         REGEX_ASSERT(matcher->start(status) == 0);
1026         REGEX_ASSERT(matcher->start(1, status) == -1);
1027         REGEX_ASSERT(matcher->start(2, status) == 1);
1028
1029         REGEX_ASSERT(matcher->find());
1030         REGEX_ASSERT(matcher->start(status) == 4);
1031         REGEX_ASSERT(matcher->start(1, status) == 4);
1032         REGEX_ASSERT(matcher->start(2, status) == -1);
1033         REGEX_CHECK_STATUS;
1034
1035         delete matcher;
1036         delete pat;
1037     }
1038
1039     //
1040     //   find with zero length matches, match position should bump ahead
1041     //     to prevent loops.
1042     //
1043     {
1044         int32_t                 i;
1045         UErrorCode          status=U_ZERO_ERROR;
1046         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1047                                                       //   using an always-true look-ahead.
1048         REGEX_CHECK_STATUS;
1049         UnicodeString s("    ");
1050         m.reset(s);
1051         for (i=0; ; i++) {
1052             if (m.find() == FALSE) {
1053                 break;
1054             }
1055             REGEX_ASSERT(m.start(status) == i);
1056             REGEX_ASSERT(m.end(status) == i);
1057         }
1058         REGEX_ASSERT(i==5);
1059
1060         // Check that the bump goes over surrogate pairs OK
1061         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1062         s = s.unescape();
1063         m.reset(s);
1064         for (i=0; ; i+=2) {
1065             if (m.find() == FALSE) {
1066                 break;
1067             }
1068             REGEX_ASSERT(m.start(status) == i);
1069             REGEX_ASSERT(m.end(status) == i);
1070         }
1071         REGEX_ASSERT(i==10);
1072     }
1073     {
1074         // find() loop breaking test.
1075         //        with pattern of /.?/, should see a series of one char matches, then a single
1076         //        match of zero length at the end of the input string.
1077         int32_t                 i;
1078         UErrorCode          status=U_ZERO_ERROR;
1079         RegexMatcher        m(".?", 0, status);
1080         REGEX_CHECK_STATUS;
1081         UnicodeString s("    ");
1082         m.reset(s);
1083         for (i=0; ; i++) {
1084             if (m.find() == FALSE) {
1085                 break;
1086             }
1087             REGEX_ASSERT(m.start(status) == i);
1088             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1089         }
1090         REGEX_ASSERT(i==5);
1091     }
1092
1093
1094     //
1095     // Matchers with no input string behave as if they had an empty input string.
1096     //
1097
1098     {
1099         UErrorCode status = U_ZERO_ERROR;
1100         RegexMatcher  m(".?", 0, status);
1101         REGEX_CHECK_STATUS;
1102         REGEX_ASSERT(m.find());
1103         REGEX_ASSERT(m.start(status) == 0);
1104         REGEX_ASSERT(m.input() == "");
1105     }
1106     {
1107         UErrorCode status = U_ZERO_ERROR;
1108         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1109         RegexMatcher  *m = p->matcher(status);
1110         REGEX_CHECK_STATUS;
1111
1112         REGEX_ASSERT(m->find() == FALSE);
1113         REGEX_ASSERT(m->input() == "");
1114         delete m;
1115         delete p;
1116     }
1117
1118     //
1119     // Regions
1120     //
1121     {
1122         UErrorCode status = U_ZERO_ERROR;
1123         UnicodeString testString("This is test data");
1124         RegexMatcher m(".*", testString,  0, status);
1125         REGEX_CHECK_STATUS;
1126         REGEX_ASSERT(m.regionStart() == 0);
1127         REGEX_ASSERT(m.regionEnd() == testString.length());
1128         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1129         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1130
1131         m.region(2,4, status);
1132         REGEX_CHECK_STATUS;
1133         REGEX_ASSERT(m.matches(status));
1134         REGEX_ASSERT(m.start(status)==2);
1135         REGEX_ASSERT(m.end(status)==4);
1136         REGEX_CHECK_STATUS;
1137
1138         m.reset();
1139         REGEX_ASSERT(m.regionStart() == 0);
1140         REGEX_ASSERT(m.regionEnd() == testString.length());
1141
1142         UnicodeString shorterString("short");
1143         m.reset(shorterString);
1144         REGEX_ASSERT(m.regionStart() == 0);
1145         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1146
1147         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1148         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1149         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1150         REGEX_ASSERT(&m == &m.reset());
1151         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1152
1153         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1154         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1155         REGEX_ASSERT(&m == &m.reset());
1156         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1157
1158         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1159         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1160         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1161         REGEX_ASSERT(&m == &m.reset());
1162         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1163
1164         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1165         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1166         REGEX_ASSERT(&m == &m.reset());
1167         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1168
1169     }
1170
1171     //
1172     // hitEnd() and requireEnd()
1173     //
1174     {
1175         UErrorCode status = U_ZERO_ERROR;
1176         UnicodeString testString("aabb");
1177         RegexMatcher m1(".*", testString,  0, status);
1178         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1179         REGEX_ASSERT(m1.hitEnd() == TRUE);
1180         REGEX_ASSERT(m1.requireEnd() == FALSE);
1181         REGEX_CHECK_STATUS;
1182
1183         status = U_ZERO_ERROR;
1184         RegexMatcher m2("a*", testString, 0, status);
1185         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1186         REGEX_ASSERT(m2.hitEnd() == FALSE);
1187         REGEX_ASSERT(m2.requireEnd() == FALSE);
1188         REGEX_CHECK_STATUS;
1189
1190         status = U_ZERO_ERROR;
1191         RegexMatcher m3(".*$", testString, 0, status);
1192         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1193         REGEX_ASSERT(m3.hitEnd() == TRUE);
1194         REGEX_ASSERT(m3.requireEnd() == TRUE);
1195         REGEX_CHECK_STATUS;
1196     }
1197
1198
1199     //
1200     // Compilation error on reset with UChar *
1201     //   These were a hazard that people were stumbling over with runtime errors.
1202     //   Changed them to compiler errors by adding private methods that more closely
1203     //   matched the incorrect use of the functions.
1204     //
1205 #if 0
1206     {
1207         UErrorCode status = U_ZERO_ERROR;
1208         UChar ucharString[20];
1209         RegexMatcher m(".", 0, status);
1210         m.reset(ucharString);  // should not compile.
1211
1212         RegexPattern *p = RegexPattern::compile(".", 0, status);
1213         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1214
1215         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1216     }
1217 #endif
1218
1219     //
1220     //  Time Outs.
1221     //       Note:  These tests will need to be changed when the regexp engine is
1222     //              able to detect and cut short the exponential time behavior on
1223     //              this type of match.
1224     //
1225     {
1226         UErrorCode status = U_ZERO_ERROR;
1227         //    Enough 'a's in the string to cause the match to time out.
1228         //       (Each additional 'a' doubles the time)
1229         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1230         RegexMatcher matcher("(a+)+b", testString, 0, status);
1231         REGEX_CHECK_STATUS;
1232         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1233         matcher.setTimeLimit(100, status);
1234         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1235         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1236         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1237     }
1238     {
1239         UErrorCode status = U_ZERO_ERROR;
1240         //   Few enough 'a's to slip in under the time limit.
1241         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1242         RegexMatcher matcher("(a+)+b", testString, 0, status);
1243         REGEX_CHECK_STATUS;
1244         matcher.setTimeLimit(100, status);
1245         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1246         REGEX_CHECK_STATUS;
1247     }
1248
1249     //
1250     //  Stack Limits
1251     //
1252     {
1253         UErrorCode status = U_ZERO_ERROR;
1254         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1255
1256         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1257         //   of the '+', and makes the stack frames larger.
1258         RegexMatcher matcher("(A)+A$", testString, 0, status);
1259
1260         // With the default stack, this match should fail to run
1261         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1262         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1263
1264         // With unlimited stack, it should run
1265         status = U_ZERO_ERROR;
1266         matcher.setStackLimit(0, status);
1267         REGEX_CHECK_STATUS;
1268         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1269         REGEX_CHECK_STATUS;
1270         REGEX_ASSERT(matcher.getStackLimit() == 0);
1271
1272         // With a limited stack, the match should fail
1273         status = U_ZERO_ERROR;
1274         matcher.setStackLimit(10000, status);
1275         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1276         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1277         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1278     }
1279
1280         // A pattern that doesn't save state should work with
1281         //   a minimal sized stack
1282     {
1283         UErrorCode status = U_ZERO_ERROR;
1284         UnicodeString testString = "abc";
1285         RegexMatcher matcher("abc", testString, 0, status);
1286         REGEX_CHECK_STATUS;
1287         matcher.setStackLimit(30, status);
1288         REGEX_CHECK_STATUS;
1289         REGEX_ASSERT(matcher.matches(status) == TRUE);
1290         REGEX_CHECK_STATUS;
1291         REGEX_ASSERT(matcher.getStackLimit() == 30);
1292
1293         // Negative stack sizes should fail
1294         status = U_ZERO_ERROR;
1295         matcher.setStackLimit(1000, status);
1296         REGEX_CHECK_STATUS;
1297         matcher.setStackLimit(-1, status);
1298         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1299         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1300     }
1301
1302
1303 }
1304
1305
1306
1307
1308
1309
1310 //---------------------------------------------------------------------------
1311 //
1312 //      API_Replace        API test for class RegexMatcher, testing the
1313 //                         Replace family of functions.
1314 //
1315 //---------------------------------------------------------------------------
1316 void RegexTest::API_Replace() {
1317     //
1318     //  Replace
1319     //
1320     int32_t             flags=0;
1321     UParseError         pe;
1322     UErrorCode          status=U_ZERO_ERROR;
1323
1324     UnicodeString       re("abc");
1325     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1326     REGEX_CHECK_STATUS;
1327     UnicodeString data = ".abc..abc...abc..";
1328     //                    012345678901234567
1329     RegexMatcher *matcher = pat->matcher(data, status);
1330
1331     //
1332     //  Plain vanilla matches.
1333     //
1334     UnicodeString  dest;
1335     dest = matcher->replaceFirst("yz", status);
1336     REGEX_CHECK_STATUS;
1337     REGEX_ASSERT(dest == ".yz..abc...abc..");
1338
1339     dest = matcher->replaceAll("yz", status);
1340     REGEX_CHECK_STATUS;
1341     REGEX_ASSERT(dest == ".yz..yz...yz..");
1342
1343     //
1344     //  Plain vanilla non-matches.
1345     //
1346     UnicodeString d2 = ".abx..abx...abx..";
1347     matcher->reset(d2);
1348     dest = matcher->replaceFirst("yz", status);
1349     REGEX_CHECK_STATUS;
1350     REGEX_ASSERT(dest == ".abx..abx...abx..");
1351
1352     dest = matcher->replaceAll("yz", status);
1353     REGEX_CHECK_STATUS;
1354     REGEX_ASSERT(dest == ".abx..abx...abx..");
1355
1356     //
1357     // Empty source string
1358     //
1359     UnicodeString d3 = "";
1360     matcher->reset(d3);
1361     dest = matcher->replaceFirst("yz", status);
1362     REGEX_CHECK_STATUS;
1363     REGEX_ASSERT(dest == "");
1364
1365     dest = matcher->replaceAll("yz", status);
1366     REGEX_CHECK_STATUS;
1367     REGEX_ASSERT(dest == "");
1368
1369     //
1370     // Empty substitution string
1371     //
1372     matcher->reset(data);              // ".abc..abc...abc.."
1373     dest = matcher->replaceFirst("", status);
1374     REGEX_CHECK_STATUS;
1375     REGEX_ASSERT(dest == "...abc...abc..");
1376
1377     dest = matcher->replaceAll("", status);
1378     REGEX_CHECK_STATUS;
1379     REGEX_ASSERT(dest == "........");
1380
1381     //
1382     // match whole string
1383     //
1384     UnicodeString d4 = "abc";
1385     matcher->reset(d4);
1386     dest = matcher->replaceFirst("xyz", status);
1387     REGEX_CHECK_STATUS;
1388     REGEX_ASSERT(dest == "xyz");
1389
1390     dest = matcher->replaceAll("xyz", status);
1391     REGEX_CHECK_STATUS;
1392     REGEX_ASSERT(dest == "xyz");
1393
1394     //
1395     // Capture Group, simple case
1396     //
1397     UnicodeString       re2("a(..)");
1398     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1399     REGEX_CHECK_STATUS;
1400     UnicodeString d5 = "abcdefg";
1401     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1402     REGEX_CHECK_STATUS;
1403     dest = matcher2->replaceFirst("$1$1", status);
1404     REGEX_CHECK_STATUS;
1405     REGEX_ASSERT(dest == "bcbcdefg");
1406
1407     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1408     REGEX_CHECK_STATUS;
1409     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1410
1411     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1412     REGEX_CHECK_STATUS;
1413     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1414
1415     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1416     replacement = replacement.unescape();
1417     dest = matcher2->replaceFirst(replacement, status);
1418     REGEX_CHECK_STATUS;
1419     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1420
1421     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1422
1423
1424     //
1425     // Replacement String with \u hex escapes
1426     //
1427     {
1428         UnicodeString  src = "abc 1 abc 2 abc 3";
1429         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1430         matcher->reset(src);
1431         UnicodeString  result = matcher->replaceAll(substitute, status);
1432         REGEX_CHECK_STATUS;
1433         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1434     }
1435     {
1436         UnicodeString  src = "abc !";
1437         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1438         matcher->reset(src);
1439         UnicodeString  result = matcher->replaceAll(substitute, status);
1440         REGEX_CHECK_STATUS;
1441         UnicodeString expected = UnicodeString("--");
1442         expected.append((UChar32)0x10000);
1443         expected.append("-- !");
1444         REGEX_ASSERT(result == expected);
1445     }
1446     // TODO:  need more through testing of capture substitutions.
1447
1448     // Bug 4057
1449     //
1450     {
1451         status = U_ZERO_ERROR;
1452         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1453         RegexMatcher m("ss(.*?)ee", 0, status);
1454         REGEX_CHECK_STATUS;
1455         UnicodeString result;
1456
1457         // Multiple finds do NOT bump up the previous appendReplacement postion.
1458         m.reset(s);
1459         m.find();
1460         m.find();
1461         m.appendReplacement(result, "ooh", status);
1462         REGEX_CHECK_STATUS;
1463         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1464
1465         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1466         status = U_ZERO_ERROR;
1467         result.truncate(0);
1468         m.reset(10, status);
1469         m.find();
1470         m.find();
1471         m.appendReplacement(result, "ooh", status);
1472         REGEX_CHECK_STATUS;
1473         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1474
1475         // find() at interior of string, appendReplacemnt still starts at beginning.
1476         status = U_ZERO_ERROR;
1477         result.truncate(0);
1478         m.reset();
1479         m.find(10, status);
1480         m.find();
1481         m.appendReplacement(result, "ooh", status);
1482         REGEX_CHECK_STATUS;
1483         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1484
1485         m.appendTail(result);
1486         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1487
1488     }
1489
1490     delete matcher2;
1491     delete pat2;
1492     delete matcher;
1493     delete pat;
1494 }
1495
1496
1497 //---------------------------------------------------------------------------
1498 //
1499 //      API_Pattern       Test that the API for class RegexPattern is
1500 //                        present and nominally working.
1501 //
1502 //---------------------------------------------------------------------------
1503 void RegexTest::API_Pattern() {
1504     RegexPattern        pata;    // Test default constructor to not crash.
1505     RegexPattern        patb;
1506
1507     REGEX_ASSERT(pata == patb);
1508     REGEX_ASSERT(pata == pata);
1509
1510     UnicodeString re1("abc[a-l][m-z]");
1511     UnicodeString re2("def");
1512     UErrorCode    status = U_ZERO_ERROR;
1513     UParseError   pe;
1514
1515     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1516     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1517     REGEX_CHECK_STATUS;
1518     REGEX_ASSERT(*pat1 == *pat1);
1519     REGEX_ASSERT(*pat1 != pata);
1520
1521     // Assign
1522     patb = *pat1;
1523     REGEX_ASSERT(patb == *pat1);
1524
1525     // Copy Construct
1526     RegexPattern patc(*pat1);
1527     REGEX_ASSERT(patc == *pat1);
1528     REGEX_ASSERT(patb == patc);
1529     REGEX_ASSERT(pat1 != pat2);
1530     patb = *pat2;
1531     REGEX_ASSERT(patb != patc);
1532     REGEX_ASSERT(patb == *pat2);
1533
1534     // Compile with no flags.
1535     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1536     REGEX_ASSERT(*pat1a == *pat1);
1537
1538     REGEX_ASSERT(pat1a->flags() == 0);
1539
1540     // Compile with different flags should be not equal
1541     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1542     REGEX_CHECK_STATUS;
1543
1544     REGEX_ASSERT(*pat1b != *pat1a);
1545     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1546     REGEX_ASSERT(pat1a->flags() == 0);
1547     delete pat1b;
1548
1549     // clone
1550     RegexPattern *pat1c = pat1->clone();
1551     REGEX_ASSERT(*pat1c == *pat1);
1552     REGEX_ASSERT(*pat1c != *pat2);
1553
1554     delete pat1c;
1555     delete pat1a;
1556     delete pat1;
1557     delete pat2;
1558
1559
1560     //
1561     //   Verify that a matcher created from a cloned pattern works.
1562     //     (Jitterbug 3423)
1563     //
1564     {
1565         UErrorCode     status     = U_ZERO_ERROR;
1566         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1567         RegexPattern  *pClone     = pSource->clone();
1568         delete         pSource;
1569         RegexMatcher  *mFromClone = pClone->matcher(status);
1570         REGEX_CHECK_STATUS;
1571         UnicodeString s = "Hello World";
1572         mFromClone->reset(s);
1573         REGEX_ASSERT(mFromClone->find() == TRUE);
1574         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1575         REGEX_ASSERT(mFromClone->find() == TRUE);
1576         REGEX_ASSERT(mFromClone->group(status) == "World");
1577         REGEX_ASSERT(mFromClone->find() == FALSE);
1578         delete mFromClone;
1579         delete pClone;
1580     }
1581
1582     //
1583     //   matches convenience API
1584     //
1585     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1586     REGEX_CHECK_STATUS;
1587     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1588     REGEX_CHECK_STATUS;
1589     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1590     REGEX_CHECK_STATUS;
1591     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1592     REGEX_CHECK_STATUS;
1593     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1594     REGEX_CHECK_STATUS;
1595     status = U_INDEX_OUTOFBOUNDS_ERROR;
1596     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1597     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1598
1599
1600     //
1601     // Split()
1602     //
1603     status = U_ZERO_ERROR;
1604     pat1 = RegexPattern::compile(" +",  pe, status);
1605     REGEX_CHECK_STATUS;
1606     UnicodeString  fields[10];
1607
1608     int32_t n;
1609     n = pat1->split("Now is the time", fields, 10, status);
1610     REGEX_CHECK_STATUS;
1611     REGEX_ASSERT(n==4);
1612     REGEX_ASSERT(fields[0]=="Now");
1613     REGEX_ASSERT(fields[1]=="is");
1614     REGEX_ASSERT(fields[2]=="the");
1615     REGEX_ASSERT(fields[3]=="time");
1616     REGEX_ASSERT(fields[4]=="");
1617
1618     n = pat1->split("Now is the time", fields, 2, status);
1619     REGEX_CHECK_STATUS;
1620     REGEX_ASSERT(n==2);
1621     REGEX_ASSERT(fields[0]=="Now");
1622     REGEX_ASSERT(fields[1]=="is the time");
1623     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1624
1625     fields[1] = "*";
1626     status = U_ZERO_ERROR;
1627     n = pat1->split("Now is the time", fields, 1, status);
1628     REGEX_CHECK_STATUS;
1629     REGEX_ASSERT(n==1);
1630     REGEX_ASSERT(fields[0]=="Now is the time");
1631     REGEX_ASSERT(fields[1]=="*");
1632     status = U_ZERO_ERROR;
1633
1634     n = pat1->split("    Now       is the time   ", fields, 10, status);
1635     REGEX_CHECK_STATUS;
1636     REGEX_ASSERT(n==6);
1637     REGEX_ASSERT(fields[0]=="");
1638     REGEX_ASSERT(fields[1]=="Now");
1639     REGEX_ASSERT(fields[2]=="is");
1640     REGEX_ASSERT(fields[3]=="the");
1641     REGEX_ASSERT(fields[4]=="time");
1642     REGEX_ASSERT(fields[5]=="");
1643
1644     n = pat1->split("     ", fields, 10, status);
1645     REGEX_CHECK_STATUS;
1646     REGEX_ASSERT(n==2);
1647     REGEX_ASSERT(fields[0]=="");
1648     REGEX_ASSERT(fields[1]=="");
1649
1650     fields[0] = "foo";
1651     n = pat1->split("", fields, 10, status);
1652     REGEX_CHECK_STATUS;
1653     REGEX_ASSERT(n==0);
1654     REGEX_ASSERT(fields[0]=="foo");
1655
1656     delete pat1;
1657
1658     //  split, with a pattern with (capture)
1659     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1660     REGEX_CHECK_STATUS;
1661
1662     status = U_ZERO_ERROR;
1663     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1664     REGEX_CHECK_STATUS;
1665     REGEX_ASSERT(n==7);
1666     REGEX_ASSERT(fields[0]=="");
1667     REGEX_ASSERT(fields[1]=="a");
1668     REGEX_ASSERT(fields[2]=="Now is ");
1669     REGEX_ASSERT(fields[3]=="b");
1670     REGEX_ASSERT(fields[4]=="the time");
1671     REGEX_ASSERT(fields[5]=="c");
1672     REGEX_ASSERT(fields[6]=="");
1673     REGEX_ASSERT(status==U_ZERO_ERROR);
1674
1675     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1676     REGEX_CHECK_STATUS;
1677     REGEX_ASSERT(n==7);
1678     REGEX_ASSERT(fields[0]=="  ");
1679     REGEX_ASSERT(fields[1]=="a");
1680     REGEX_ASSERT(fields[2]=="Now is ");
1681     REGEX_ASSERT(fields[3]=="b");
1682     REGEX_ASSERT(fields[4]=="the time");
1683     REGEX_ASSERT(fields[5]=="c");
1684     REGEX_ASSERT(fields[6]=="");
1685
1686     status = U_ZERO_ERROR;
1687     fields[6] = "foo";
1688     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1689     REGEX_CHECK_STATUS;
1690     REGEX_ASSERT(n==6);
1691     REGEX_ASSERT(fields[0]=="  ");
1692     REGEX_ASSERT(fields[1]=="a");
1693     REGEX_ASSERT(fields[2]=="Now is ");
1694     REGEX_ASSERT(fields[3]=="b");
1695     REGEX_ASSERT(fields[4]=="the time");
1696     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1697     REGEX_ASSERT(fields[6]=="foo");
1698
1699     status = U_ZERO_ERROR;
1700     fields[5] = "foo";
1701     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1702     REGEX_CHECK_STATUS;
1703     REGEX_ASSERT(n==5);
1704     REGEX_ASSERT(fields[0]=="  ");
1705     REGEX_ASSERT(fields[1]=="a");
1706     REGEX_ASSERT(fields[2]=="Now is ");
1707     REGEX_ASSERT(fields[3]=="b");
1708     REGEX_ASSERT(fields[4]=="the time<c>");
1709     REGEX_ASSERT(fields[5]=="foo");
1710
1711     status = U_ZERO_ERROR;
1712     fields[5] = "foo";
1713     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1714     REGEX_CHECK_STATUS;
1715     REGEX_ASSERT(n==5);
1716     REGEX_ASSERT(fields[0]=="  ");
1717     REGEX_ASSERT(fields[1]=="a");
1718     REGEX_ASSERT(fields[2]=="Now is ");
1719     REGEX_ASSERT(fields[3]=="b");
1720     REGEX_ASSERT(fields[4]=="the time");
1721     REGEX_ASSERT(fields[5]=="foo");
1722
1723     status = U_ZERO_ERROR;
1724     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1725     REGEX_CHECK_STATUS;
1726     REGEX_ASSERT(n==4);
1727     REGEX_ASSERT(fields[0]=="  ");
1728     REGEX_ASSERT(fields[1]=="a");
1729     REGEX_ASSERT(fields[2]=="Now is ");
1730     REGEX_ASSERT(fields[3]=="the time<c>");
1731     status = U_ZERO_ERROR;
1732     delete pat1;
1733
1734     pat1 = RegexPattern::compile("([-,])",  pe, status);
1735     REGEX_CHECK_STATUS;
1736     n = pat1->split("1-10,20", fields, 10, status);
1737     REGEX_CHECK_STATUS;
1738     REGEX_ASSERT(n==5);
1739     REGEX_ASSERT(fields[0]=="1");
1740     REGEX_ASSERT(fields[1]=="-");
1741     REGEX_ASSERT(fields[2]=="10");
1742     REGEX_ASSERT(fields[3]==",");
1743     REGEX_ASSERT(fields[4]=="20");
1744     delete pat1;
1745
1746     // Test split of string with empty trailing fields
1747     pat1 = RegexPattern::compile(",", pe, status);
1748     REGEX_CHECK_STATUS;
1749     n = pat1->split("a,b,c,", fields, 10, status);
1750     REGEX_CHECK_STATUS;
1751     REGEX_ASSERT(n==4);
1752     REGEX_ASSERT(fields[0]=="a");
1753     REGEX_ASSERT(fields[1]=="b");
1754     REGEX_ASSERT(fields[2]=="c");
1755     REGEX_ASSERT(fields[3]=="");
1756
1757     n = pat1->split("a,,,", fields, 10, status);
1758     REGEX_CHECK_STATUS;
1759     REGEX_ASSERT(n==4);
1760     REGEX_ASSERT(fields[0]=="a");
1761     REGEX_ASSERT(fields[1]=="");
1762     REGEX_ASSERT(fields[2]=="");
1763     REGEX_ASSERT(fields[3]=="");
1764     delete pat1;
1765
1766     // Split Separator with zero length match.
1767     pat1 = RegexPattern::compile(":?", pe, status);
1768     REGEX_CHECK_STATUS;
1769     n = pat1->split("abc", fields, 10, status);
1770     REGEX_CHECK_STATUS;
1771     REGEX_ASSERT(n==5);
1772     REGEX_ASSERT(fields[0]=="");
1773     REGEX_ASSERT(fields[1]=="a");
1774     REGEX_ASSERT(fields[2]=="b");
1775     REGEX_ASSERT(fields[3]=="c");
1776     REGEX_ASSERT(fields[4]=="");
1777
1778     delete pat1;
1779
1780     //
1781     // RegexPattern::pattern()
1782     //
1783     pat1 = new RegexPattern();
1784     REGEX_ASSERT(pat1->pattern() == "");
1785     delete pat1;
1786
1787     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1788     REGEX_CHECK_STATUS;
1789     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1790     delete pat1;
1791
1792
1793     //
1794     // classID functions
1795     //
1796     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1797     REGEX_CHECK_STATUS;
1798     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1799     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1800     UnicodeString Hello("Hello, world.");
1801     RegexMatcher *m = pat1->matcher(Hello, status);
1802     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1803     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1804     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1805     delete m;
1806     delete pat1;
1807
1808 }
1809
1810 //---------------------------------------------------------------------------
1811 //
1812 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1813 //                       is present and working, but excluding functions
1814 //                       implementing replace operations.
1815 //
1816 //---------------------------------------------------------------------------
1817 void RegexTest::API_Match_UTF8() {
1818     UParseError         pe;
1819     UErrorCode          status=U_ZERO_ERROR;
1820     int32_t             flags = 0;
1821
1822     //
1823     // Debug - slide failing test cases early
1824     //
1825 #if 0
1826     {
1827     }
1828     return;
1829 #endif
1830
1831     //
1832     // Simple pattern compilation
1833     //
1834     {
1835         UText               re = UTEXT_INITIALIZER;
1836         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1837         REGEX_VERBOSE_TEXT(&re);
1838         RegexPattern        *pat2;
1839         pat2 = RegexPattern::compile(&re, flags, pe, status);
1840         REGEX_CHECK_STATUS;
1841
1842         UText input1 = UTEXT_INITIALIZER;
1843         UText input2 = UTEXT_INITIALIZER;
1844         UText empty  = UTEXT_INITIALIZER;
1845         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1846         REGEX_VERBOSE_TEXT(&input1);
1847         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1848         REGEX_VERBOSE_TEXT(&input2);
1849         utext_openUChars(&empty, NULL, 0, &status);
1850
1851         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1852         int32_t input2Len = strlen("not abc");
1853
1854
1855         //
1856         // Matcher creation and reset.
1857         //
1858         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1859         REGEX_CHECK_STATUS;
1860         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1861         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1862         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1863         m1->reset(&input2);
1864         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1865         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1866         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1867         m1->reset(&input1);
1868         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1870         m1->reset(&empty);
1871         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1872         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1873
1874         //
1875         //  reset(pos, status)
1876         //
1877         m1->reset(&input1);
1878         m1->reset(4, status);
1879         REGEX_CHECK_STATUS;
1880         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1881         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1882
1883         m1->reset(-1, status);
1884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1885         status = U_ZERO_ERROR;
1886
1887         m1->reset(0, status);
1888         REGEX_CHECK_STATUS;
1889         status = U_ZERO_ERROR;
1890
1891         m1->reset(input1Len-1, status);
1892         REGEX_CHECK_STATUS;
1893         status = U_ZERO_ERROR;
1894
1895         m1->reset(input1Len, status);
1896         REGEX_CHECK_STATUS;
1897         status = U_ZERO_ERROR;
1898
1899         m1->reset(input1Len+1, status);
1900         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1901         status = U_ZERO_ERROR;
1902
1903         //
1904         // match(pos, status)
1905         //
1906         m1->reset(&input2);
1907         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1908         m1->reset();
1909         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1910         m1->reset();
1911         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1912         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1913         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1914         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1915
1916         // Match() at end of string should fail, but should not
1917         //  be an error.
1918         status = U_ZERO_ERROR;
1919         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1920         REGEX_CHECK_STATUS;
1921
1922         // Match beyond end of string should fail with an error.
1923         status = U_ZERO_ERROR;
1924         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1925         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1926
1927         // Successful match at end of string.
1928         {
1929             status = U_ZERO_ERROR;
1930             RegexMatcher m("A?", 0, status);  // will match zero length string.
1931             REGEX_CHECK_STATUS;
1932             m.reset(&input1);
1933             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1934             REGEX_CHECK_STATUS;
1935             m.reset(&empty);
1936             REGEX_ASSERT(m.matches(0, status) == TRUE);
1937             REGEX_CHECK_STATUS;
1938         }
1939
1940
1941         //
1942         // lookingAt(pos, status)
1943         //
1944         status = U_ZERO_ERROR;
1945         m1->reset(&input2);  // "not abc"
1946         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1947         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1948         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1949         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1950         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1951         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1952         status = U_ZERO_ERROR;
1953         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1954         REGEX_CHECK_STATUS;
1955         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1956         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1957
1958         delete m1;
1959         delete pat2;
1960
1961         utext_close(&re);
1962         utext_close(&input1);
1963         utext_close(&input2);
1964         utext_close(&empty);
1965     }
1966
1967
1968     //
1969     // Capture Group.
1970     //     RegexMatcher::start();
1971     //     RegexMatcher::end();
1972     //     RegexMatcher::groupCount();
1973     //
1974     {
1975         int32_t             flags=0;
1976         UParseError         pe;
1977         UErrorCode          status=U_ZERO_ERROR;
1978         UText               re=UTEXT_INITIALIZER;
1979         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1980         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1981
1982         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1983         REGEX_CHECK_STATUS;
1984
1985         UText input = UTEXT_INITIALIZER;
1986         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1987         utext_openUTF8(&input, str_0123456789, -1, &status);
1988
1989         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1990         REGEX_CHECK_STATUS;
1991         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1992         static const int32_t matchStarts[] = {0,  2, 4, 8};
1993         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1994         int32_t i;
1995         for (i=0; i<4; i++) {
1996             int32_t actualStart = matcher->start(i, status);
1997             REGEX_CHECK_STATUS;
1998             if (actualStart != matchStarts[i]) {
1999                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2000                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2001             }
2002             int32_t actualEnd = matcher->end(i, status);
2003             REGEX_CHECK_STATUS;
2004             if (actualEnd != matchEnds[i]) {
2005                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2006                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2007             }
2008         }
2009
2010         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2011         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2012
2013         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2014         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2015         matcher->reset();
2016         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2017
2018         matcher->lookingAt(status);
2019
2020         UnicodeString dest;
2021         UText destText = UTEXT_INITIALIZER;
2022         utext_openUnicodeString(&destText, &dest, &status);
2023         UText *result;
2024         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2025         //      Test shallow-clone API
2026         int64_t   group_len;
2027         result = matcher->group((UText *)NULL, group_len, status);
2028         REGEX_CHECK_STATUS;
2029         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2030         utext_close(result);
2031         result = matcher->group(0, &destText, group_len, status);
2032         REGEX_CHECK_STATUS;
2033         REGEX_ASSERT(result == &destText);
2034         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2035         //  destText is now immutable, reopen it
2036         utext_close(&destText);
2037         utext_openUnicodeString(&destText, &dest, &status);
2038
2039         result = matcher->group(0, NULL, status);
2040         REGEX_CHECK_STATUS;
2041         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2042         utext_close(result);
2043         result = matcher->group(0, &destText, status);
2044         REGEX_CHECK_STATUS;
2045         REGEX_ASSERT(result == &destText);
2046         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2047
2048         result = matcher->group(1, NULL, status);
2049         REGEX_CHECK_STATUS;
2050         const char str_234567[] = { 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x00 }; /* 234567 */
2051         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2052         utext_close(result);
2053         result = matcher->group(1, &destText, status);
2054         REGEX_CHECK_STATUS;
2055         REGEX_ASSERT(result == &destText);
2056         REGEX_ASSERT_UTEXT_UTF8(str_234567, result);
2057
2058         result = matcher->group(2, NULL, status);
2059         REGEX_CHECK_STATUS;
2060         const char str_45[] = { 0x34, 0x35, 0x00 }; /* 45 */
2061         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2062         utext_close(result);
2063         result = matcher->group(2, &destText, status);
2064         REGEX_CHECK_STATUS;
2065         REGEX_ASSERT(result == &destText);
2066         REGEX_ASSERT_UTEXT_UTF8(str_45, result);
2067
2068         result = matcher->group(3, NULL, status);
2069         REGEX_CHECK_STATUS;
2070         const char str_89[] = { 0x38, 0x39, 0x00 }; /* 89 */
2071         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2072         utext_close(result);
2073         result = matcher->group(3, &destText, status);
2074         REGEX_CHECK_STATUS;
2075         REGEX_ASSERT(result == &destText);
2076         REGEX_ASSERT_UTEXT_UTF8(str_89, result);
2077
2078         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2079         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2080         matcher->reset();
2081         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2082
2083         delete matcher;
2084         delete pat;
2085
2086         utext_close(&destText);
2087         utext_close(&input);
2088         utext_close(&re);
2089     }
2090
2091     //
2092     //  find
2093     //
2094     {
2095         int32_t             flags=0;
2096         UParseError         pe;
2097         UErrorCode          status=U_ZERO_ERROR;
2098         UText               re=UTEXT_INITIALIZER;
2099         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2100         utext_openUTF8(&re, str_abc, -1, &status);
2101
2102         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2103         REGEX_CHECK_STATUS;
2104         UText input = UTEXT_INITIALIZER;
2105         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2106         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2107         //                      012345678901234567
2108
2109         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2110         REGEX_CHECK_STATUS;
2111         REGEX_ASSERT(matcher->find());
2112         REGEX_ASSERT(matcher->start(status) == 1);
2113         REGEX_ASSERT(matcher->find());
2114         REGEX_ASSERT(matcher->start(status) == 6);
2115         REGEX_ASSERT(matcher->find());
2116         REGEX_ASSERT(matcher->start(status) == 12);
2117         REGEX_ASSERT(matcher->find() == FALSE);
2118         REGEX_ASSERT(matcher->find() == FALSE);
2119
2120         matcher->reset();
2121         REGEX_ASSERT(matcher->find());
2122         REGEX_ASSERT(matcher->start(status) == 1);
2123
2124         REGEX_ASSERT(matcher->find(0, status));
2125         REGEX_ASSERT(matcher->start(status) == 1);
2126         REGEX_ASSERT(matcher->find(1, status));
2127         REGEX_ASSERT(matcher->start(status) == 1);
2128         REGEX_ASSERT(matcher->find(2, status));
2129         REGEX_ASSERT(matcher->start(status) == 6);
2130         REGEX_ASSERT(matcher->find(12, status));
2131         REGEX_ASSERT(matcher->start(status) == 12);
2132         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2133         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2134         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2135         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2136
2137         status = U_ZERO_ERROR;
2138         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2139         status = U_ZERO_ERROR;
2140         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2141
2142         REGEX_ASSERT(matcher->groupCount() == 0);
2143
2144         delete matcher;
2145         delete pat;
2146
2147         utext_close(&input);
2148         utext_close(&re);
2149     }
2150
2151
2152     //
2153     //  find, with \G in pattern (true if at the end of a previous match).
2154     //
2155     {
2156         int32_t             flags=0;
2157         UParseError         pe;
2158         UErrorCode          status=U_ZERO_ERROR;
2159         UText               re=UTEXT_INITIALIZER;
2160         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2161         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2162
2163         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2164
2165         REGEX_CHECK_STATUS;
2166         UText input = UTEXT_INITIALIZER;
2167         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2168         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2169         //                      012345678901234567
2170
2171         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2172         REGEX_CHECK_STATUS;
2173         REGEX_ASSERT(matcher->find());
2174         REGEX_ASSERT(matcher->start(status) == 0);
2175         REGEX_ASSERT(matcher->start(1, status) == -1);
2176         REGEX_ASSERT(matcher->start(2, status) == 1);
2177
2178         REGEX_ASSERT(matcher->find());
2179         REGEX_ASSERT(matcher->start(status) == 4);
2180         REGEX_ASSERT(matcher->start(1, status) == 4);
2181         REGEX_ASSERT(matcher->start(2, status) == -1);
2182         REGEX_CHECK_STATUS;
2183
2184         delete matcher;
2185         delete pat;
2186
2187         utext_close(&input);
2188         utext_close(&re);
2189     }
2190
2191     //
2192     //   find with zero length matches, match position should bump ahead
2193     //     to prevent loops.
2194     //
2195     {
2196         int32_t                 i;
2197         UErrorCode          status=U_ZERO_ERROR;
2198         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2199                                                       //   using an always-true look-ahead.
2200         REGEX_CHECK_STATUS;
2201         UText s = UTEXT_INITIALIZER;
2202         utext_openUTF8(&s, "    ", -1, &status);
2203         m.reset(&s);
2204         for (i=0; ; i++) {
2205             if (m.find() == FALSE) {
2206                 break;
2207             }
2208             REGEX_ASSERT(m.start(status) == i);
2209             REGEX_ASSERT(m.end(status) == i);
2210         }
2211         REGEX_ASSERT(i==5);
2212
2213         // Check that the bump goes over characters outside the BMP OK
2214         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2215         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2216         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2217         m.reset(&s);
2218         for (i=0; ; i+=4) {
2219             if (m.find() == FALSE) {
2220                 break;
2221             }
2222             REGEX_ASSERT(m.start(status) == i);
2223             REGEX_ASSERT(m.end(status) == i);
2224         }
2225         REGEX_ASSERT(i==20);
2226
2227         utext_close(&s);
2228     }
2229     {
2230         // find() loop breaking test.
2231         //        with pattern of /.?/, should see a series of one char matches, then a single
2232         //        match of zero length at the end of the input string.
2233         int32_t                 i;
2234         UErrorCode          status=U_ZERO_ERROR;
2235         RegexMatcher        m(".?", 0, status);
2236         REGEX_CHECK_STATUS;
2237         UText s = UTEXT_INITIALIZER;
2238         utext_openUTF8(&s, "    ", -1, &status);
2239         m.reset(&s);
2240         for (i=0; ; i++) {
2241             if (m.find() == FALSE) {
2242                 break;
2243             }
2244             REGEX_ASSERT(m.start(status) == i);
2245             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2246         }
2247         REGEX_ASSERT(i==5);
2248
2249         utext_close(&s);
2250     }
2251
2252
2253     //
2254     // Matchers with no input string behave as if they had an empty input string.
2255     //
2256
2257     {
2258         UErrorCode status = U_ZERO_ERROR;
2259         RegexMatcher  m(".?", 0, status);
2260         REGEX_CHECK_STATUS;
2261         REGEX_ASSERT(m.find());
2262         REGEX_ASSERT(m.start(status) == 0);
2263         REGEX_ASSERT(m.input() == "");
2264     }
2265     {
2266         UErrorCode status = U_ZERO_ERROR;
2267         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2268         RegexMatcher  *m = p->matcher(status);
2269         REGEX_CHECK_STATUS;
2270
2271         REGEX_ASSERT(m->find() == FALSE);
2272         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2273         delete m;
2274         delete p;
2275     }
2276
2277     //
2278     // Regions
2279     //
2280     {
2281         UErrorCode status = U_ZERO_ERROR;
2282         UText testPattern = UTEXT_INITIALIZER;
2283         UText testText    = UTEXT_INITIALIZER;
2284         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2285         REGEX_VERBOSE_TEXT(&testPattern);
2286         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2287         REGEX_VERBOSE_TEXT(&testText);
2288
2289         RegexMatcher m(&testPattern, &testText, 0, status);
2290         REGEX_CHECK_STATUS;
2291         REGEX_ASSERT(m.regionStart() == 0);
2292         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2293         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2294         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2295
2296         m.region(2,4, status);
2297         REGEX_CHECK_STATUS;
2298         REGEX_ASSERT(m.matches(status));
2299         REGEX_ASSERT(m.start(status)==2);
2300         REGEX_ASSERT(m.end(status)==4);
2301         REGEX_CHECK_STATUS;
2302
2303         m.reset();
2304         REGEX_ASSERT(m.regionStart() == 0);
2305         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2306
2307         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2308         REGEX_VERBOSE_TEXT(&testText);
2309         m.reset(&testText);
2310         REGEX_ASSERT(m.regionStart() == 0);
2311         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2312
2313         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2314         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2315         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2316         REGEX_ASSERT(&m == &m.reset());
2317         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2318
2319         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2320         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2321         REGEX_ASSERT(&m == &m.reset());
2322         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2323
2324         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2326         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2327         REGEX_ASSERT(&m == &m.reset());
2328         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2329
2330         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2331         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2332         REGEX_ASSERT(&m == &m.reset());
2333         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2334
2335         utext_close(&testText);
2336         utext_close(&testPattern);
2337     }
2338
2339     //
2340     // hitEnd() and requireEnd()
2341     //
2342     {
2343         UErrorCode status = U_ZERO_ERROR;
2344         UText testPattern = UTEXT_INITIALIZER;
2345         UText testText    = UTEXT_INITIALIZER;
2346         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2347         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2348         utext_openUTF8(&testPattern, str_, -1, &status);
2349         utext_openUTF8(&testText, str_aabb, -1, &status);
2350
2351         RegexMatcher m1(&testPattern, &testText,  0, status);
2352         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2353         REGEX_ASSERT(m1.hitEnd() == TRUE);
2354         REGEX_ASSERT(m1.requireEnd() == FALSE);
2355         REGEX_CHECK_STATUS;
2356
2357         status = U_ZERO_ERROR;
2358         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2359         utext_openUTF8(&testPattern, str_a, -1, &status);
2360         RegexMatcher m2(&testPattern, &testText, 0, status);
2361         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2362         REGEX_ASSERT(m2.hitEnd() == FALSE);
2363         REGEX_ASSERT(m2.requireEnd() == FALSE);
2364         REGEX_CHECK_STATUS;
2365
2366         status = U_ZERO_ERROR;
2367         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2368         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2369         RegexMatcher m3(&testPattern, &testText, 0, status);
2370         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2371         REGEX_ASSERT(m3.hitEnd() == TRUE);
2372         REGEX_ASSERT(m3.requireEnd() == TRUE);
2373         REGEX_CHECK_STATUS;
2374
2375         utext_close(&testText);
2376         utext_close(&testPattern);
2377     }
2378 }
2379
2380
2381 //---------------------------------------------------------------------------
2382 //
2383 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2384 //                         Replace family of functions.
2385 //
2386 //---------------------------------------------------------------------------
2387 void RegexTest::API_Replace_UTF8() {
2388     //
2389     //  Replace
2390     //
2391     int32_t             flags=0;
2392     UParseError         pe;
2393     UErrorCode          status=U_ZERO_ERROR;
2394
2395     UText               re=UTEXT_INITIALIZER;
2396     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2397     REGEX_VERBOSE_TEXT(&re);
2398     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2399     REGEX_CHECK_STATUS;
2400
2401     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2402     //             012345678901234567
2403     UText dataText = UTEXT_INITIALIZER;
2404     utext_openUTF8(&dataText, data, -1, &status);
2405     REGEX_CHECK_STATUS;
2406     REGEX_VERBOSE_TEXT(&dataText);
2407     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2408
2409     //
2410     //  Plain vanilla matches.
2411     //
2412     UnicodeString  dest;
2413     UText destText = UTEXT_INITIALIZER;
2414     utext_openUnicodeString(&destText, &dest, &status);
2415     UText *result;
2416
2417     UText replText = UTEXT_INITIALIZER;
2418
2419     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2420     utext_openUTF8(&replText, str_yz, -1, &status);
2421     REGEX_VERBOSE_TEXT(&replText);
2422     result = matcher->replaceFirst(&replText, NULL, status);
2423     REGEX_CHECK_STATUS;
2424     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2425     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2426     utext_close(result);
2427     result = matcher->replaceFirst(&replText, &destText, status);
2428     REGEX_CHECK_STATUS;
2429     REGEX_ASSERT(result == &destText);
2430     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2431
2432     result = matcher->replaceAll(&replText, NULL, status);
2433     REGEX_CHECK_STATUS;
2434     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2435     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2436     utext_close(result);
2437
2438     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2439     result = matcher->replaceAll(&replText, &destText, status);
2440     REGEX_CHECK_STATUS;
2441     REGEX_ASSERT(result == &destText);
2442     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2443
2444     //
2445     //  Plain vanilla non-matches.
2446     //
2447     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2448     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2449     matcher->reset(&dataText);
2450
2451     result = matcher->replaceFirst(&replText, NULL, status);
2452     REGEX_CHECK_STATUS;
2453     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2454     utext_close(result);
2455     result = matcher->replaceFirst(&replText, &destText, status);
2456     REGEX_CHECK_STATUS;
2457     REGEX_ASSERT(result == &destText);
2458     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2459
2460     result = matcher->replaceAll(&replText, NULL, status);
2461     REGEX_CHECK_STATUS;
2462     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2463     utext_close(result);
2464     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2465     result = matcher->replaceAll(&replText, &destText, status);
2466     REGEX_CHECK_STATUS;
2467     REGEX_ASSERT(result == &destText);
2468     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2469
2470     //
2471     // Empty source string
2472     //
2473     utext_openUTF8(&dataText, NULL, 0, &status);
2474     matcher->reset(&dataText);
2475
2476     result = matcher->replaceFirst(&replText, NULL, status);
2477     REGEX_CHECK_STATUS;
2478     REGEX_ASSERT_UTEXT_UTF8("", result);
2479     utext_close(result);
2480     result = matcher->replaceFirst(&replText, &destText, status);
2481     REGEX_CHECK_STATUS;
2482     REGEX_ASSERT(result == &destText);
2483     REGEX_ASSERT_UTEXT_UTF8("", result);
2484
2485     result = matcher->replaceAll(&replText, NULL, status);
2486     REGEX_CHECK_STATUS;
2487     REGEX_ASSERT_UTEXT_UTF8("", result);
2488     utext_close(result);
2489     result = matcher->replaceAll(&replText, &destText, status);
2490     REGEX_CHECK_STATUS;
2491     REGEX_ASSERT(result == &destText);
2492     REGEX_ASSERT_UTEXT_UTF8("", result);
2493
2494     //
2495     // Empty substitution string
2496     //
2497     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2498     matcher->reset(&dataText);
2499
2500     utext_openUTF8(&replText, NULL, 0, &status);
2501     result = matcher->replaceFirst(&replText, NULL, status);
2502     REGEX_CHECK_STATUS;
2503     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2504     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2505     utext_close(result);
2506     result = matcher->replaceFirst(&replText, &destText, status);
2507     REGEX_CHECK_STATUS;
2508     REGEX_ASSERT(result == &destText);
2509     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2510
2511     result = matcher->replaceAll(&replText, NULL, status);
2512     REGEX_CHECK_STATUS;
2513     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2514     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2515     utext_close(result);
2516     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2517     result = matcher->replaceAll(&replText, &destText, status);
2518     REGEX_CHECK_STATUS;
2519     REGEX_ASSERT(result == &destText);
2520     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2521
2522     //
2523     // match whole string
2524     //
2525     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2526     utext_openUTF8(&dataText, str_abc, -1, &status);
2527     matcher->reset(&dataText);
2528
2529     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2530     utext_openUTF8(&replText, str_xyz, -1, &status);
2531     result = matcher->replaceFirst(&replText, NULL, status);
2532     REGEX_CHECK_STATUS;
2533     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2534     utext_close(result);
2535     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2536     result = matcher->replaceFirst(&replText, &destText, status);
2537     REGEX_CHECK_STATUS;
2538     REGEX_ASSERT(result == &destText);
2539     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2540
2541     result = matcher->replaceAll(&replText, NULL, status);
2542     REGEX_CHECK_STATUS;
2543     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2544     utext_close(result);
2545     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2546     result = matcher->replaceAll(&replText, &destText, status);
2547     REGEX_CHECK_STATUS;
2548     REGEX_ASSERT(result == &destText);
2549     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2550
2551     //
2552     // Capture Group, simple case
2553     //
2554     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2555     utext_openUTF8(&re, str_add, -1, &status);
2556     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2557     REGEX_CHECK_STATUS;
2558
2559     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2560     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2561     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2562     REGEX_CHECK_STATUS;
2563
2564     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2565     utext_openUTF8(&replText, str_11, -1, &status);
2566     result = matcher2->replaceFirst(&replText, NULL, status);
2567     REGEX_CHECK_STATUS;
2568     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2569     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2570     utext_close(result);
2571     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2572     result = matcher2->replaceFirst(&replText, &destText, status);
2573     REGEX_CHECK_STATUS;
2574     REGEX_ASSERT(result == &destText);
2575     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2576
2577     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2578     utext_openUTF8(&replText, str_v, -1, &status);
2579     REGEX_VERBOSE_TEXT(&replText);
2580     result = matcher2->replaceFirst(&replText, NULL, status);
2581     REGEX_CHECK_STATUS;
2582     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2583     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2584     utext_close(result);
2585     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2586     result = matcher2->replaceFirst(&replText, &destText, status);
2587     REGEX_CHECK_STATUS;
2588     REGEX_ASSERT(result == &destText);
2589     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2590
2591     const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
2592     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2593     result = matcher2->replaceFirst(&replText, NULL, status);
2594     REGEX_CHECK_STATUS;
2595     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597     utext_close(result);
2598     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2599     result = matcher2->replaceFirst(&replText, &destText, status);
2600     REGEX_CHECK_STATUS;
2601     REGEX_ASSERT(result == &destText);
2602     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2603
2604     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606     //                                 012345678901234567890123456
2607     supplDigitChars[22] = 0xF0;
2608     supplDigitChars[23] = 0x9D;
2609     supplDigitChars[24] = 0x9F;
2610     supplDigitChars[25] = 0x8F;
2611     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2612
2613     result = matcher2->replaceFirst(&replText, NULL, status);
2614     REGEX_CHECK_STATUS;
2615     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617     utext_close(result);
2618     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2619     result = matcher2->replaceFirst(&replText, &destText, status);
2620     REGEX_CHECK_STATUS;
2621     REGEX_ASSERT(result == &destText);
2622     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2623     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2624     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2625     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2626 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627     utext_close(result);
2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2630     REGEX_ASSERT(result == &destText);
2631 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632
2633     //
2634     // Replacement String with \u hex escapes
2635     //
2636     {
2637       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2640         utext_openUTF8(&replText, str_u0043, -1, &status);
2641         matcher->reset(&dataText);
2642
2643         result = matcher->replaceAll(&replText, NULL, status);
2644         REGEX_CHECK_STATUS;
2645         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647         utext_close(result);
2648         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2649         result = matcher->replaceAll(&replText, &destText, status);
2650         REGEX_CHECK_STATUS;
2651         REGEX_ASSERT(result == &destText);
2652         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2653     }
2654     {
2655       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656         utext_openUTF8(&dataText, str_abc, -1, &status);
2657         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658         utext_openUTF8(&replText, str_U00010000, -1, &status);
2659         matcher->reset(&dataText);
2660
2661         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2662         //                          0123456789
2663         expected[2] = 0xF0;
2664         expected[3] = 0x90;
2665         expected[4] = 0x80;
2666         expected[5] = 0x80;
2667
2668         result = matcher->replaceAll(&replText, NULL, status);
2669         REGEX_CHECK_STATUS;
2670         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671         utext_close(result);
2672         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2673         result = matcher->replaceAll(&replText, &destText, status);
2674         REGEX_CHECK_STATUS;
2675         REGEX_ASSERT(result == &destText);
2676         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2677     }
2678     // TODO:  need more through testing of capture substitutions.
2679
2680     // Bug 4057
2681     //
2682     {
2683         status = U_ZERO_ERROR;
2684 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687         utext_openUTF8(&re, str_ssee, -1, &status);
2688         utext_openUTF8(&dataText, str_blah, -1, &status);
2689         utext_openUTF8(&replText, str_ooh, -1, &status);
2690
2691         RegexMatcher m(&re, 0, status);
2692         REGEX_CHECK_STATUS;
2693
2694         UnicodeString result;
2695         UText resultText = UTEXT_INITIALIZER;
2696         utext_openUnicodeString(&resultText, &result, &status);
2697
2698         // Multiple finds do NOT bump up the previous appendReplacement postion.
2699         m.reset(&dataText);
2700         m.find();
2701         m.find();
2702         m.appendReplacement(&resultText, &replText, status);
2703         REGEX_CHECK_STATUS;
2704         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2706
2707         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708         status = U_ZERO_ERROR;
2709         result.truncate(0);
2710         utext_openUnicodeString(&resultText, &result, &status);
2711         m.reset(10, status);
2712         m.find();
2713         m.find();
2714         m.appendReplacement(&resultText, &replText, status);
2715         REGEX_CHECK_STATUS;
2716         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2718
2719         // find() at interior of string, appendReplacement still starts at beginning.
2720         status = U_ZERO_ERROR;
2721         result.truncate(0);
2722         utext_openUnicodeString(&resultText, &result, &status);
2723         m.reset();
2724         m.find(10, status);
2725         m.find();
2726         m.appendReplacement(&resultText, &replText, status);
2727         REGEX_CHECK_STATUS;
2728         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2730
2731         m.appendTail(&resultText, status);
2732         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2734
2735         utext_close(&resultText);
2736     }
2737
2738     delete matcher2;
2739     delete pat2;
2740     delete matcher;
2741     delete pat;
2742
2743     utext_close(&dataText);
2744     utext_close(&replText);
2745     utext_close(&destText);
2746     utext_close(&re);
2747 }
2748
2749
2750 //---------------------------------------------------------------------------
2751 //
2752 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2753 //                        present and nominally working.
2754 //
2755 //---------------------------------------------------------------------------
2756 void RegexTest::API_Pattern_UTF8() {
2757     RegexPattern        pata;    // Test default constructor to not crash.
2758     RegexPattern        patb;
2759
2760     REGEX_ASSERT(pata == patb);
2761     REGEX_ASSERT(pata == pata);
2762
2763     UText         re1 = UTEXT_INITIALIZER;
2764     UText         re2 = UTEXT_INITIALIZER;
2765     UErrorCode    status = U_ZERO_ERROR;
2766     UParseError   pe;
2767
2768     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2771     utext_openUTF8(&re2, str_def, -1, &status);
2772
2773     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2774     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2775     REGEX_CHECK_STATUS;
2776     REGEX_ASSERT(*pat1 == *pat1);
2777     REGEX_ASSERT(*pat1 != pata);
2778
2779     // Assign
2780     patb = *pat1;
2781     REGEX_ASSERT(patb == *pat1);
2782
2783     // Copy Construct
2784     RegexPattern patc(*pat1);
2785     REGEX_ASSERT(patc == *pat1);
2786     REGEX_ASSERT(patb == patc);
2787     REGEX_ASSERT(pat1 != pat2);
2788     patb = *pat2;
2789     REGEX_ASSERT(patb != patc);
2790     REGEX_ASSERT(patb == *pat2);
2791
2792     // Compile with no flags.
2793     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2794     REGEX_ASSERT(*pat1a == *pat1);
2795
2796     REGEX_ASSERT(pat1a->flags() == 0);
2797
2798     // Compile with different flags should be not equal
2799     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2800     REGEX_CHECK_STATUS;
2801
2802     REGEX_ASSERT(*pat1b != *pat1a);
2803     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2804     REGEX_ASSERT(pat1a->flags() == 0);
2805     delete pat1b;
2806
2807     // clone
2808     RegexPattern *pat1c = pat1->clone();
2809     REGEX_ASSERT(*pat1c == *pat1);
2810     REGEX_ASSERT(*pat1c != *pat2);
2811
2812     delete pat1c;
2813     delete pat1a;
2814     delete pat1;
2815     delete pat2;
2816
2817     utext_close(&re1);
2818     utext_close(&re2);
2819
2820
2821     //
2822     //   Verify that a matcher created from a cloned pattern works.
2823     //     (Jitterbug 3423)
2824     //
2825     {
2826         UErrorCode     status     = U_ZERO_ERROR;
2827         UText          pattern    = UTEXT_INITIALIZER;
2828         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829         utext_openUTF8(&pattern, str_pL, -1, &status);
2830
2831         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2832         RegexPattern  *pClone     = pSource->clone();
2833         delete         pSource;
2834         RegexMatcher  *mFromClone = pClone->matcher(status);
2835         REGEX_CHECK_STATUS;
2836
2837         UText          input      = UTEXT_INITIALIZER;
2838         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2840         mFromClone->reset(&input);
2841         REGEX_ASSERT(mFromClone->find() == TRUE);
2842         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2843         REGEX_ASSERT(mFromClone->find() == TRUE);
2844         REGEX_ASSERT(mFromClone->group(status) == "World");
2845         REGEX_ASSERT(mFromClone->find() == FALSE);
2846         delete mFromClone;
2847         delete pClone;
2848
2849         utext_close(&input);
2850         utext_close(&pattern);
2851     }
2852
2853     //
2854     //   matches convenience API
2855     //
2856     {
2857         UErrorCode status  = U_ZERO_ERROR;
2858         UText      pattern = UTEXT_INITIALIZER;
2859         UText      input   = UTEXT_INITIALIZER;
2860
2861         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862         utext_openUTF8(&input, str_randominput, -1, &status);
2863
2864         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2866         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2867         REGEX_CHECK_STATUS;
2868
2869         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870         utext_openUTF8(&pattern, str_abc, -1, &status);
2871         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2872         REGEX_CHECK_STATUS;
2873
2874         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875         utext_openUTF8(&pattern, str_nput, -1, &status);
2876         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2877         REGEX_CHECK_STATUS;
2878
2879         utext_openUTF8(&pattern, str_randominput, -1, &status);
2880         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2881         REGEX_CHECK_STATUS;
2882
2883         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884         utext_openUTF8(&pattern, str_u, -1, &status);
2885         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2886         REGEX_CHECK_STATUS;
2887
2888         utext_openUTF8(&input, str_abc, -1, &status);
2889         utext_openUTF8(&pattern, str_abc, -1, &status);
2890         status = U_INDEX_OUTOFBOUNDS_ERROR;
2891         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2892         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2893
2894         utext_close(&input);
2895         utext_close(&pattern);
2896     }
2897
2898
2899     //
2900     // Split()
2901     //
2902     status = U_ZERO_ERROR;
2903     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2904     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2905     pat1 = RegexPattern::compile(&re1, pe, status);
2906     REGEX_CHECK_STATUS;
2907     UnicodeString  fields[10];
2908
2909     int32_t n;
2910     n = pat1->split("Now is the time", fields, 10, status);
2911     REGEX_CHECK_STATUS;
2912     REGEX_ASSERT(n==4);
2913     REGEX_ASSERT(fields[0]=="Now");
2914     REGEX_ASSERT(fields[1]=="is");
2915     REGEX_ASSERT(fields[2]=="the");
2916     REGEX_ASSERT(fields[3]=="time");
2917     REGEX_ASSERT(fields[4]=="");
2918
2919     n = pat1->split("Now is the time", fields, 2, status);
2920     REGEX_CHECK_STATUS;
2921     REGEX_ASSERT(n==2);
2922     REGEX_ASSERT(fields[0]=="Now");
2923     REGEX_ASSERT(fields[1]=="is the time");
2924     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2925
2926     fields[1] = "*";
2927     status = U_ZERO_ERROR;
2928     n = pat1->split("Now is the time", fields, 1, status);
2929     REGEX_CHECK_STATUS;
2930     REGEX_ASSERT(n==1);
2931     REGEX_ASSERT(fields[0]=="Now is the time");
2932     REGEX_ASSERT(fields[1]=="*");
2933     status = U_ZERO_ERROR;
2934
2935     n = pat1->split("    Now       is the time   ", fields, 10, status);
2936     REGEX_CHECK_STATUS;
2937     REGEX_ASSERT(n==6);
2938     REGEX_ASSERT(fields[0]=="");
2939     REGEX_ASSERT(fields[1]=="Now");
2940     REGEX_ASSERT(fields[2]=="is");
2941     REGEX_ASSERT(fields[3]=="the");
2942     REGEX_ASSERT(fields[4]=="time");
2943     REGEX_ASSERT(fields[5]=="");
2944     REGEX_ASSERT(fields[6]=="");
2945
2946     fields[2] = "*";
2947     n = pat1->split("     ", fields, 10, status);
2948     REGEX_CHECK_STATUS;
2949     REGEX_ASSERT(n==2);
2950     REGEX_ASSERT(fields[0]=="");
2951     REGEX_ASSERT(fields[1]=="");
2952     REGEX_ASSERT(fields[2]=="*");
2953
2954     fields[0] = "foo";
2955     n = pat1->split("", fields, 10, status);
2956     REGEX_CHECK_STATUS;
2957     REGEX_ASSERT(n==0);
2958     REGEX_ASSERT(fields[0]=="foo");
2959
2960     delete pat1;
2961
2962     //  split, with a pattern with (capture)
2963     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2964     pat1 = RegexPattern::compile(&re1,  pe, status);
2965     REGEX_CHECK_STATUS;
2966
2967     status = U_ZERO_ERROR;
2968     fields[6] = fields[7] = "*";
2969     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2970     REGEX_CHECK_STATUS;
2971     REGEX_ASSERT(n==7);
2972     REGEX_ASSERT(fields[0]=="");
2973     REGEX_ASSERT(fields[1]=="a");
2974     REGEX_ASSERT(fields[2]=="Now is ");
2975     REGEX_ASSERT(fields[3]=="b");
2976     REGEX_ASSERT(fields[4]=="the time");
2977     REGEX_ASSERT(fields[5]=="c");
2978     REGEX_ASSERT(fields[6]=="");
2979     REGEX_ASSERT(fields[7]=="*");
2980     REGEX_ASSERT(status==U_ZERO_ERROR);
2981
2982     fields[6] = fields[7] = "*";
2983     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2984     REGEX_CHECK_STATUS;
2985     REGEX_ASSERT(n==7);
2986     REGEX_ASSERT(fields[0]=="  ");
2987     REGEX_ASSERT(fields[1]=="a");
2988     REGEX_ASSERT(fields[2]=="Now is ");
2989     REGEX_ASSERT(fields[3]=="b");
2990     REGEX_ASSERT(fields[4]=="the time");
2991     REGEX_ASSERT(fields[5]=="c");
2992     REGEX_ASSERT(fields[6]=="");
2993     REGEX_ASSERT(fields[7]=="*");
2994
2995     status = U_ZERO_ERROR;
2996     fields[6] = "foo";
2997     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2998     REGEX_CHECK_STATUS;
2999     REGEX_ASSERT(n==6);
3000     REGEX_ASSERT(fields[0]=="  ");
3001     REGEX_ASSERT(fields[1]=="a");
3002     REGEX_ASSERT(fields[2]=="Now is ");
3003     REGEX_ASSERT(fields[3]=="b");
3004     REGEX_ASSERT(fields[4]=="the time");
3005     REGEX_ASSERT(fields[5]==" ");
3006     REGEX_ASSERT(fields[6]=="foo");
3007
3008     status = U_ZERO_ERROR;
3009     fields[5] = "foo";
3010     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3011     REGEX_CHECK_STATUS;
3012     REGEX_ASSERT(n==5);
3013     REGEX_ASSERT(fields[0]=="  ");
3014     REGEX_ASSERT(fields[1]=="a");
3015     REGEX_ASSERT(fields[2]=="Now is ");
3016     REGEX_ASSERT(fields[3]=="b");
3017     REGEX_ASSERT(fields[4]=="the time<c>");
3018     REGEX_ASSERT(fields[5]=="foo");
3019
3020     status = U_ZERO_ERROR;
3021     fields[5] = "foo";
3022     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3023     REGEX_CHECK_STATUS;
3024     REGEX_ASSERT(n==5);
3025     REGEX_ASSERT(fields[0]=="  ");
3026     REGEX_ASSERT(fields[1]=="a");
3027     REGEX_ASSERT(fields[2]=="Now is ");
3028     REGEX_ASSERT(fields[3]=="b");
3029     REGEX_ASSERT(fields[4]=="the time");
3030     REGEX_ASSERT(fields[5]=="foo");
3031
3032     status = U_ZERO_ERROR;
3033     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3034     REGEX_CHECK_STATUS;
3035     REGEX_ASSERT(n==4);
3036     REGEX_ASSERT(fields[0]=="  ");
3037     REGEX_ASSERT(fields[1]=="a");
3038     REGEX_ASSERT(fields[2]=="Now is ");
3039     REGEX_ASSERT(fields[3]=="the time<c>");
3040     status = U_ZERO_ERROR;
3041     delete pat1;
3042
3043     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3044     pat1 = RegexPattern::compile(&re1, pe, status);
3045     REGEX_CHECK_STATUS;
3046     n = pat1->split("1-10,20", fields, 10, status);
3047     REGEX_CHECK_STATUS;
3048     REGEX_ASSERT(n==5);
3049     REGEX_ASSERT(fields[0]=="1");
3050     REGEX_ASSERT(fields[1]=="-");
3051     REGEX_ASSERT(fields[2]=="10");
3052     REGEX_ASSERT(fields[3]==",");
3053     REGEX_ASSERT(fields[4]=="20");
3054     delete pat1;
3055
3056
3057     //
3058     // RegexPattern::pattern() and patternText()
3059     //
3060     pat1 = new RegexPattern();
3061     REGEX_ASSERT(pat1->pattern() == "");
3062     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3063     delete pat1;
3064     const char *helloWorldInvariant = "(Hello, world)*";
3065     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3066     pat1 = RegexPattern::compile(&re1, pe, status);
3067     REGEX_CHECK_STATUS;
3068     REGEX_ASSERT_UNISTR(pat1->pattern(),"(Hello, world)*");
3069     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3070     delete pat1;
3071
3072     utext_close(&re1);
3073 }
3074
3075
3076 //---------------------------------------------------------------------------
3077 //
3078 //      Extended       A more thorough check for features of regex patterns
3079 //                     The test cases are in a separate data file,
3080 //                       source/test/testdata/regextst.txt
3081 //                     A description of the test data format is included in that file.
3082 //
3083 //---------------------------------------------------------------------------
3084
3085 const char *
3086 RegexTest::getPath(char buffer[2048], const char *filename) {
3087     UErrorCode status=U_ZERO_ERROR;
3088     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3089     if (U_FAILURE(status)) {
3090         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3091         return NULL;
3092     }
3093
3094     strcpy(buffer, testDataDirectory);
3095     strcat(buffer, filename);
3096     return buffer;
3097 }
3098
3099 void RegexTest::Extended() {
3100     char tdd[2048];
3101     const char *srcPath;
3102     UErrorCode  status  = U_ZERO_ERROR;
3103     int32_t     lineNum = 0;
3104
3105     //
3106     //  Open and read the test data file.
3107     //
3108     srcPath=getPath(tdd, "regextst.txt");
3109     if(srcPath==NULL) {
3110         return; /* something went wrong, error already output */
3111     }
3112
3113     int32_t    len;
3114     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3115     if (U_FAILURE(status)) {
3116         return; /* something went wrong, error already output */
3117     }
3118
3119     //
3120     //  Put the test data into a UnicodeString
3121     //
3122     UnicodeString testString(FALSE, testData, len);
3123
3124     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3125     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3126     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3127
3128     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3129     UnicodeString   testPattern;   // The pattern for test from the test file.
3130     UnicodeString   testFlags;     // the flags   for a test.
3131     UnicodeString   matchString;   // The marked up string to be used as input
3132
3133     if (U_FAILURE(status)){
3134         dataerrln("Construct RegexMatcher() error.");
3135         delete [] testData;
3136         return;
3137     }
3138
3139     //
3140     //  Loop over the test data file, once per line.
3141     //
3142     while (lineMat.find()) {
3143         lineNum++;
3144         if (U_FAILURE(status)) {
3145           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3146         }
3147
3148         status = U_ZERO_ERROR;
3149         UnicodeString testLine = lineMat.group(1, status);
3150         if (testLine.length() == 0) {
3151             continue;
3152         }
3153
3154         //
3155         // Parse the test line.  Skip blank and comment only lines.
3156         // Separate out the three main fields - pattern, flags, target.
3157         //
3158
3159         commentMat.reset(testLine);
3160         if (commentMat.lookingAt(status)) {
3161             // This line is a comment, or blank.
3162             continue;
3163         }
3164
3165         //
3166         //  Pull out the pattern field, remove it from the test file line.
3167         //
3168         quotedStuffMat.reset(testLine);
3169         if (quotedStuffMat.lookingAt(status)) {
3170             testPattern = quotedStuffMat.group(2, status);
3171             testLine.remove(0, quotedStuffMat.end(0, status));
3172         } else {
3173             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3174             continue;
3175         }
3176
3177
3178         //
3179         //  Pull out the flags from the test file line.
3180         //
3181         flagsMat.reset(testLine);
3182         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3183         testFlags = flagsMat.group(1, status);
3184         if (flagsMat.group(2, status).length() > 0) {
3185             errln("Bad Match flag at line %d. Scanning %c\n",
3186                 lineNum, flagsMat.group(2, status).charAt(0));
3187             continue;
3188         }
3189         testLine.remove(0, flagsMat.end(0, status));
3190
3191         //
3192         //  Pull out the match string, as a whole.
3193         //    We'll process the <tags> later.
3194         //
3195         quotedStuffMat.reset(testLine);
3196         if (quotedStuffMat.lookingAt(status)) {
3197             matchString = quotedStuffMat.group(2, status);
3198             testLine.remove(0, quotedStuffMat.end(0, status));
3199         } else {
3200             errln("Bad match string at test file line %d", lineNum);
3201             continue;
3202         }
3203
3204         //
3205         //  The only thing left from the input line should be an optional trailing comment.
3206         //
3207         commentMat.reset(testLine);
3208         if (commentMat.lookingAt(status) == FALSE) {
3209             errln("Line %d: unexpected characters at end of test line.", lineNum);
3210             continue;
3211         }
3212
3213         //
3214         //  Run the test
3215         //
3216         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3217     }
3218
3219     delete [] testData;
3220
3221 }
3222
3223
3224
3225 //---------------------------------------------------------------------------
3226 //
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in messages.
//         lineNumber is the source line in regextst.txt of the test.
3233 //
3234 //---------------------------------------------------------------------------
3235
3236
3237 //  Set a value into a UVector at position specified by a decimal number in
3238 //   a UnicodeString.   This is a utility function needed by the actual test function,
3239 //   which follows.
3240 static void set(UVector &vec, int32_t val, UnicodeString index) {
3241     UErrorCode  status=U_ZERO_ERROR;
3242     int32_t  idx = 0;
3243     for (int32_t i=0; i<index.length(); i++) {
3244         int32_t d=u_charDigitValue(index.charAt(i));
3245         if (d<0) {return;}
3246         idx = idx*10 + d;
3247     }
3248     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3249     vec.setElementAt(val, idx);
3250 }
3251
3252 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3253     UErrorCode  status=U_ZERO_ERROR;
3254     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3255     vec.setElementAt(val, idx);
3256 }
3257
3258 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3259 {
3260     UBool couldFind = TRUE;
3261     UTEXT_SETNATIVEINDEX(utext, 0);
3262     int32_t i = 0;
3263     while (i < unistrOffset) {
3264         UChar32 c = UTEXT_NEXT32(utext);
3265         if (c != U_SENTINEL) {
3266             i += U16_LENGTH(c);
3267         } else {
3268             couldFind = FALSE;
3269             break;
3270         }
3271     }
3272     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3273     return couldFind;
3274 }
3275
3276
3277 void RegexTest::regex_find(const UnicodeString &pattern,
3278                            const UnicodeString &flags,
3279                            const UnicodeString &inputString,
3280                            const char *srcPath,
3281                            int32_t line) {
3282     UnicodeString       unEscapedInput;
3283     UnicodeString       deTaggedInput;
3284
3285     int32_t             patternUTF8Length,      inputUTF8Length;
3286     char                *patternChars  = NULL, *inputChars = NULL;
3287     UText               patternText    = UTEXT_INITIALIZER;
3288     UText               inputText      = UTEXT_INITIALIZER;
3289     UConverter          *UTF8Converter = NULL;
3290
3291     UErrorCode          status         = U_ZERO_ERROR;
3292     UParseError         pe;
3293     RegexPattern        *parsePat      = NULL;
3294     RegexMatcher        *parseMatcher  = NULL;
3295     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3296     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3297     UVector             groupStarts(status);
3298     UVector             groupEnds(status);
3299     UVector             groupStartsUTF8(status);
3300     UVector             groupEndsUTF8(status);
3301     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3302     UBool               failed         = FALSE;
3303     int32_t             numFinds;
3304     int32_t             i;
3305     UBool               useMatchesFunc   = FALSE;
3306     UBool               useLookingAtFunc = FALSE;
3307     int32_t             regionStart      = -1;
3308     int32_t             regionEnd        = -1;
3309     int32_t             regionStartUTF8  = -1;
3310     int32_t             regionEndUTF8    = -1;
3311
3312
3313     //
3314     //  Compile the caller's pattern
3315     //
3316     uint32_t bflags = 0;
3317     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3318         bflags |= UREGEX_CASE_INSENSITIVE;
3319     }
3320     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3321         bflags |= UREGEX_COMMENTS;
3322     }
3323     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3324         bflags |= UREGEX_DOTALL;
3325     }
3326     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3327         bflags |= UREGEX_MULTILINE;
3328     }
3329
3330     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3331         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3332     }
3333     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3334         bflags |= UREGEX_UNIX_LINES;
3335     }
3336     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3337         bflags |= UREGEX_LITERAL;
3338     }
3339
3340
3341     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3342     if (status != U_ZERO_ERROR) {
3343         #if UCONFIG_NO_BREAK_ITERATION==1
3344         // 'v' test flag means that the test pattern should not compile if ICU was configured
3345         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3346         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3347             goto cleanupAndReturn;
3348         }
3349         #endif
3350         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3351             // Expected pattern compilation error.
3352             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3353                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3354             }
3355             goto cleanupAndReturn;
3356         } else {
3357             // Unexpected pattern compilation error.
3358             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3359             goto cleanupAndReturn;
3360         }
3361     }
3362
3363     UTF8Converter = ucnv_open("UTF8", &status);
3364     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3365
3366     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3367     status = U_ZERO_ERROR; // buffer overflow
3368     patternChars = new char[patternUTF8Length+1];
3369     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3370     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3371
3372     if (status == U_ZERO_ERROR) {
3373         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3374
3375         if (status != U_ZERO_ERROR) {
3376 #if UCONFIG_NO_BREAK_ITERATION==1
3377             // 'v' test flag means that the test pattern should not compile if ICU was configured
3378             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3379             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3380                 goto cleanupAndReturn;
3381             }
3382 #endif
3383             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3384                 // Expected pattern compilation error.
3385                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3386                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3387                 }
3388                 goto cleanupAndReturn;
3389             } else {
3390                 // Unexpected pattern compilation error.
3391                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3392                 goto cleanupAndReturn;
3393             }
3394         }
3395     }
3396
3397     if (UTF8Pattern == NULL) {
3398         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3399         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3400         status = U_ZERO_ERROR;
3401     }
3402
3403     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3404         RegexPatternDump(callerPattern);
3405     }
3406
3407     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3408         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3409         goto cleanupAndReturn;
3410     }
3411
3412
3413     //
3414     // Number of times find() should be called on the test string, default to 1
3415     //
3416     numFinds = 1;
3417     for (i=2; i<=9; i++) {
3418         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3419             if (numFinds != 1) {
3420                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3421                 goto cleanupAndReturn;
3422             }
3423             numFinds = i;
3424         }
3425     }
3426
3427     // 'M' flag.  Use matches() instead of find()
3428     if (flags.indexOf((UChar)0x4d) >= 0) {
3429         useMatchesFunc = TRUE;
3430     }
3431     if (flags.indexOf((UChar)0x4c) >= 0) {
3432         useLookingAtFunc = TRUE;
3433     }
3434
3435     //
3436     //  Find the tags in the input data, remove them, and record the group boundary
3437     //    positions.
3438     //
3439     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3440     REGEX_CHECK_STATUS_L(line);
3441
3442     unEscapedInput = inputString.unescape();
3443     parseMatcher = parsePat->matcher(unEscapedInput, status);
3444     REGEX_CHECK_STATUS_L(line);
3445     while(parseMatcher->find()) {
3446         parseMatcher->appendReplacement(deTaggedInput, "", status);
3447         REGEX_CHECK_STATUS;
3448         UnicodeString groupNum = parseMatcher->group(2, status);
3449         if (groupNum == "r") {
3450             // <r> or </r>, a region specification within the string
3451             if (parseMatcher->group(1, status) == "/") {
3452                 regionEnd = deTaggedInput.length();
3453             } else {
3454                 regionStart = deTaggedInput.length();
3455             }
3456         } else {
3457             // <digits> or </digits>, a group match boundary tag.
3458             if (parseMatcher->group(1, status) == "/") {
3459                 set(groupEnds, deTaggedInput.length(), groupNum);
3460             } else {
3461                 set(groupStarts, deTaggedInput.length(), groupNum);
3462             }
3463         }
3464     }
3465     parseMatcher->appendTail(deTaggedInput);
3466     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3467     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3468       errln("mismatched <r> tags");
3469       failed = TRUE;
3470       goto cleanupAndReturn;
3471     }
3472
3473     //
3474     //  Configure the matcher according to the flags specified with this test.
3475     //
3476     matcher = callerPattern->matcher(deTaggedInput, status);
3477     REGEX_CHECK_STATUS_L(line);
3478     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3479         matcher->setTrace(TRUE);
3480     }
3481
3482     if (UTF8Pattern != NULL) {
3483         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3484         status = U_ZERO_ERROR; // buffer overflow
3485         inputChars = new char[inputUTF8Length+1];
3486         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3487         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3488
3489         if (status == U_ZERO_ERROR) {
3490             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3491             REGEX_CHECK_STATUS_L(line);
3492         }
3493
3494         if (UTF8Matcher == NULL) {
3495             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3496           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3497             status = U_ZERO_ERROR;
3498         }
3499     }
3500
3501     //
3502     //  Generate native indices for UTF8 versions of region and capture group info
3503     //
3504     if (UTF8Matcher != NULL) {
3505         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3506         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3507
3508         //  Fill out the native index UVector info.
3509         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3510         for (i=0; i<groupStarts.size(); i++) {
3511             int32_t  start = groupStarts.elementAti(i);
3512             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3513             if (start >= 0) {
3514                 int32_t  startUTF8;
3515                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3516                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3517                     failed = TRUE;
3518                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3519                 }
3520                 setInt(groupStartsUTF8, startUTF8, i);
3521             }
3522
3523             int32_t  end = groupEnds.elementAti(i);
3524             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3525             if (end >= 0) {
3526                 int32_t  endUTF8;
3527                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3528                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3529                     failed = TRUE;
3530                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3531                 }
3532                 setInt(groupEndsUTF8, endUTF8, i);
3533             }
3534         }
3535     }
3536
3537     if (regionStart>=0) {
3538        matcher->region(regionStart, regionEnd, status);
3539        REGEX_CHECK_STATUS_L(line);
3540        if (UTF8Matcher != NULL) {
3541            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3542            REGEX_CHECK_STATUS_L(line);
3543        }
3544     }
3545     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3546         matcher->useAnchoringBounds(FALSE);
3547         if (UTF8Matcher != NULL) {
3548             UTF8Matcher->useAnchoringBounds(FALSE);
3549         }
3550     }
3551     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3552         matcher->useTransparentBounds(TRUE);
3553         if (UTF8Matcher != NULL) {
3554             UTF8Matcher->useTransparentBounds(TRUE);
3555         }
3556     }
3557
3558
3559
3560     //
3561     // Do a find on the de-tagged input using the caller's pattern
3562     //     TODO: error on count>1 and not find().
3563     //           error on both matches() and lookingAt().
3564     //
3565     for (i=0; i<numFinds; i++) {
3566         if (useMatchesFunc) {
3567             isMatch = matcher->matches(status);
3568             if (UTF8Matcher != NULL) {
3569                isUTF8Match = UTF8Matcher->matches(status);
3570             }
3571         } else  if (useLookingAtFunc) {
3572             isMatch = matcher->lookingAt(status);
3573             if (UTF8Matcher != NULL) {
3574                 isUTF8Match = UTF8Matcher->lookingAt(status);
3575             }
3576         } else {
3577             isMatch = matcher->find();
3578             if (UTF8Matcher != NULL) {
3579                 isUTF8Match = UTF8Matcher->find();
3580             }
3581         }
3582     }
3583     matcher->setTrace(FALSE);
3584
3585     //
3586     // Match up the groups from the find() with the groups from the tags
3587     //
3588
3589     // number of tags should match number of groups from find operation.
3590     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3591     //   G option in test means that capture group data is not available in the
3592     //     expected results, so the check needs to be suppressed.
3593     if (isMatch == FALSE && groupStarts.size() != 0) {
3594         dataerrln("Error at line %d:  Match expected, but none found.", line);
3595         failed = TRUE;
3596         goto cleanupAndReturn;
3597     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3598         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3599         failed = TRUE;
3600         goto cleanupAndReturn;
3601     }
3602
3603     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3604         // Only check for match / no match.  Don't check capture groups.
3605         if (isMatch && groupStarts.size() == 0) {
3606             errln("Error at line %d:  No match expected, but one found.", line);
3607             failed = TRUE;
3608         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3609             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3610             failed = TRUE;
3611         }
3612         goto cleanupAndReturn;
3613     }
3614
3615     REGEX_CHECK_STATUS_L(line);
3616     for (i=0; i<=matcher->groupCount(); i++) {
3617         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3618         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3619         if (matcher->start(i, status) != expectedStart) {
3620             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3621                 line, i, expectedStart, matcher->start(i, status));
3622             failed = TRUE;
3623             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3624         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3625             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3626                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3627             failed = TRUE;
3628             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3629         }
3630
3631         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3632         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3633         if (matcher->end(i, status) != expectedEnd) {
3634             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3635                 line, i, expectedEnd, matcher->end(i, status));
3636             failed = TRUE;
3637             // Error on end position;  keep going; real error is probably yet to come as group
3638             //   end positions work from end of the input data towards the front.
3639         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3640             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3641                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3642             failed = TRUE;
3643             // Error on end position;  keep going; real error is probably yet to come as group
3644             //   end positions work from end of the input data towards the front.
3645         }
3646     }
3647     if ( matcher->groupCount()+1 < groupStarts.size()) {
3648         errln("Error at line %d: Expected %d capture groups, found %d.",
3649             line, groupStarts.size()-1, matcher->groupCount());
3650         failed = TRUE;
3651         }
3652     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3653         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3654               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3655         failed = TRUE;
3656     }
3657
3658     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3659         matcher->requireEnd() == TRUE) {
3660         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3661         failed = TRUE;
3662     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3663         UTF8Matcher->requireEnd() == TRUE) {
3664         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3665         failed = TRUE;
3666     }
3667
3668     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3669         matcher->requireEnd() == FALSE) {
3670         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3671         failed = TRUE;
3672     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3673         UTF8Matcher->requireEnd() == FALSE) {
3674         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3675         failed = TRUE;
3676     }
3677
3678     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3679         matcher->hitEnd() == TRUE) {
3680         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3681         failed = TRUE;
3682     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3683                UTF8Matcher->hitEnd() == TRUE) {
3684         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3685         failed = TRUE;
3686     }
3687
3688     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3689         matcher->hitEnd() == FALSE) {
3690         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3691         failed = TRUE;
3692     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3693                UTF8Matcher->hitEnd() == FALSE) {
3694         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3695         failed = TRUE;
3696     }
3697
3698
3699 cleanupAndReturn:
3700     if (failed) {
3701         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3702             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3703         // callerPattern->dump();
3704     }
3705     delete parseMatcher;
3706     delete parsePat;
3707     delete UTF8Matcher;
3708     delete UTF8Pattern;
3709     delete matcher;
3710     delete callerPattern;
3711
3712     utext_close(&inputText);
3713     delete[] inputChars;
3714     utext_close(&patternText);
3715     delete[] patternChars;
3716     ucnv_close(UTF8Converter);
3717 }
3718
3719
3720
3721
3722 //---------------------------------------------------------------------------
3723 //
3724 //      Errors     Check for error handling in patterns.
3725 //
3726 //---------------------------------------------------------------------------
3727 void RegexTest::Errors() {
3728     // \escape sequences that aren't implemented yet.
3729     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3730
3731     // Missing close parentheses
3732     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3733     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3734     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3735
3736     // Extra close paren
3737     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3738     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3739     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3740
3741     // Look-ahead, Look-behind
3742     //  TODO:  add tests for unbounded length look-behinds.
3743     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3744
3745     // Attempt to use non-default flags
3746     {
3747         UParseError   pe;
3748         UErrorCode    status = U_ZERO_ERROR;
3749         int32_t       flags  = UREGEX_CANON_EQ |
3750                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3751                                UREGEX_MULTILINE;
3752         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3753         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3754         delete pat1;
3755     }
3756
3757
3758     // Quantifiers are allowed only after something that can be quantified.
3759     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3760     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3761     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3762
3763     // Mal-formed {min,max} quantifiers
3764     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3765     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3766     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3767     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3768     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3769     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3770     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3771     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3772     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3773
3774     // Ticket 5389
3775     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3776
3777     // Invalid Back Reference \0
3778     //    For ICU 3.8 and earlier
3779     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3780     //
3781     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3782
3783 }
3784
3785
3786 //-------------------------------------------------------------------------------
3787 //
3788 //  Read a text data file, convert it to UChars, and return the data
3789 //    in one big UChar * buffer, which the caller must delete.
3790 //
3791 //--------------------------------------------------------------------------------
3792 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3793                                      const char *defEncoding, UErrorCode &status) {
3794     UChar       *retPtr  = NULL;
3795     char        *fileBuf = NULL;
3796     UConverter* conv     = NULL;
3797     FILE        *f       = NULL;
3798
3799     ulen = 0;
3800     if (U_FAILURE(status)) {
3801         return retPtr;
3802     }
3803
3804     //
3805     //  Open the file.
3806     //
3807     f = fopen(fileName, "rb");
3808     if (f == 0) {
3809         dataerrln("Error opening test data file %s\n", fileName);
3810         status = U_FILE_ACCESS_ERROR;
3811         return NULL;
3812     }
3813     //
3814     //  Read it in
3815     //
3816     int32_t            fileSize;
3817     int32_t            amt_read;
3818
3819     fseek( f, 0, SEEK_END);
3820     fileSize = ftell(f);
3821     fileBuf = new char[fileSize];
3822     fseek(f, 0, SEEK_SET);
3823     amt_read = fread(fileBuf, 1, fileSize, f);
3824     if (amt_read != fileSize || fileSize <= 0) {
3825         errln("Error reading test data file.");
3826         goto cleanUpAndReturn;
3827     }
3828
3829     //
3830     // Look for a Unicode Signature (BOM) on the data just read
3831     //
3832     int32_t        signatureLength;
3833     const char *   fileBufC;
3834     const char*    encoding;
3835
3836     fileBufC = fileBuf;
3837     encoding = ucnv_detectUnicodeSignature(
3838         fileBuf, fileSize, &signatureLength, &status);
3839     if(encoding!=NULL ){
3840         fileBufC  += signatureLength;
3841         fileSize  -= signatureLength;
3842     } else {
3843         encoding = defEncoding;
3844         if (strcmp(encoding, "utf-8") == 0) {
3845             errln("file %s is missing its BOM", fileName);
3846         }
3847     }
3848
3849     //
3850     // Open a converter to take the rule file to UTF-16
3851     //
3852     conv = ucnv_open(encoding, &status);
3853     if (U_FAILURE(status)) {
3854         goto cleanUpAndReturn;
3855     }
3856
3857     //
3858     // Convert the rules to UChar.
3859     //  Preflight first to determine required buffer size.
3860     //
3861     ulen = ucnv_toUChars(conv,
3862         NULL,           //  dest,
3863         0,              //  destCapacity,
3864         fileBufC,
3865         fileSize,
3866         &status);
3867     if (status == U_BUFFER_OVERFLOW_ERROR) {
3868         // Buffer Overflow is expected from the preflight operation.
3869         status = U_ZERO_ERROR;
3870
3871         retPtr = new UChar[ulen+1];
3872         ucnv_toUChars(conv,
3873             retPtr,       //  dest,
3874             ulen+1,
3875             fileBufC,
3876             fileSize,
3877             &status);
3878     }
3879
3880 cleanUpAndReturn:
3881     fclose(f);
3882     delete[] fileBuf;
3883     ucnv_close(conv);
3884     if (U_FAILURE(status)) {
3885         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3886         delete []retPtr;
3887         retPtr = 0;
3888         ulen   = 0;
3889     };
3890     return retPtr;
3891 }
3892
3893
3894 //-------------------------------------------------------------------------------
3895 //
3896 //   PerlTests  - Run Perl's regular expression tests
3897 //                The input file for this test is re_tests, the standard regular
3898 //                expression test data distributed with the Perl source code.
3899 //
3900 //                Here is Perl's description of the test data file:
3901 //
3902 //        # The tests are in a separate file 't/op/re_tests'.
3903 //        # Each line in that file is a separate test.
3904 //        # There are five columns, separated by tabs.
3905 //        #
3906 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3907 //        # Modifiers can be put after the closing C<'>.
3908 //        #
3909 //        # Column 2 contains the string to be matched.
3910 //        #
3911 //        # Column 3 contains the expected result:
3912 //        #     y   expect a match
3913 //        #     n   expect no match
3914 //        #     c   expect an error
3915 //        # B   test exposes a known bug in Perl, should be skipped
3916 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3917 //        #
3918 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3919 //        #
3920 //        # Column 4 contains a string, usually C<$&>.
3921 //        #
3922 //        # Column 5 contains the expected result of double-quote
3923 //        # interpolating that string after the match, or start of error message.
3924 //        #
3925 //        # Column 6, if present, contains a reason why the test is skipped.
3926 //        # This is printed with "skipped", for harness to pick up.
3927 //        #
3928 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3929 //        #
3930 //        # If you want to add a regular expression test that can't be expressed
3931 //        # in this format, don't add it here: put it in op/pat.t instead.
3932 //
3933 //        For ICU, if field 3 contains an 'i', the test will be skipped.
3934 //        The test exposes is some known incompatibility between ICU and Perl regexps.
3935 //        (The i is in addition to whatever was there before.)
3936 //
3937 //-------------------------------------------------------------------------------
3938 void RegexTest::PerlTests() {
3939     char tdd[2048];
3940     const char *srcPath;
3941     UErrorCode  status = U_ZERO_ERROR;
3942     UParseError pe;
3943
3944     //
3945     //  Open and read the test data file.
3946     //
3947     srcPath=getPath(tdd, "re_tests.txt");
3948     if(srcPath==NULL) {
3949         return; /* something went wrong, error already output */
3950     }
3951
3952     int32_t    len;
3953     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3954     if (U_FAILURE(status)) {
3955         return; /* something went wrong, error already output */
3956     }
3957
3958     //
3959     //  Put the test data into a UnicodeString
3960     //
3961     UnicodeString testDataString(FALSE, testData, len);
3962
3963     //
3964     //  Regex to break the input file into lines, and strip the new lines.
3965     //     One line per match, capture group one is the desired data.
3966     //
3967     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
3968     if (U_FAILURE(status)) {
3969         dataerrln("RegexPattern::compile() error");
3970         return;
3971     }
3972     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
3973
3974     //
3975     //  Regex to split a test file line into fields.
3976     //    There are six fields, separated by tabs.
3977     //
3978     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
3979
3980     //
3981     //  Regex to identify test patterns with flag settings, and to separate them.
3982     //    Test patterns with flags look like 'pattern'i
3983     //    Test patterns without flags are not quoted:   pattern
3984     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
3985     //
3986     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
3987     RegexMatcher* flagMat = flagPat->matcher(status);
3988
3989     //
3990     // The Perl tests reference several perl-isms, which are evaluated/substituted
3991     //   in the test data.  Not being perl, this must be done explicitly.  Here
3992     //   are string constants and REs for these constructs.
3993     //
3994     UnicodeString nulnulSrc("${nulnul}");
3995     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
3996     nulnul = nulnul.unescape();
3997
3998     UnicodeString ffffSrc("${ffff}");
3999     UnicodeString ffff("\\uffff", -1, US_INV);
4000     ffff = ffff.unescape();
4001
4002     //  regexp for $-[0], $+[2], etc.
4003     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4004     RegexMatcher *groupsMat = groupsPat->matcher(status);
4005
4006     //  regexp for $0, $1, $2, etc.
4007     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4008     RegexMatcher *cgMat = cgPat->matcher(status);
4009
4010
4011     //
4012     // Main Loop for the Perl Tests, runs once per line from the
4013     //   test data file.
4014     //
4015     int32_t  lineNum = 0;
4016     int32_t  skippedUnimplementedCount = 0;
4017     while (lineMat->find()) {
4018         lineNum++;
4019
4020         //
4021         //  Get a line, break it into its fields, do the Perl
4022         //    variable substitutions.
4023         //
4024         UnicodeString line = lineMat->group(1, status);
4025         UnicodeString fields[7];
4026         fieldPat->split(line, fields, 7, status);
4027
4028         flagMat->reset(fields[0]);
4029         flagMat->matches(status);
4030         UnicodeString pattern  = flagMat->group(2, status);
4031         pattern.findAndReplace("${bang}", "!");
4032         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4033         pattern.findAndReplace(ffffSrc, ffff);
4034
4035         //
4036         //  Identify patterns that include match flag settings,
4037         //    split off the flags, remove the extra quotes.
4038         //
4039         UnicodeString flagStr = flagMat->group(3, status);
4040         if (U_FAILURE(status)) {
4041             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4042             return;
4043         }
4044         int32_t flags = 0;
4045         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4046         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4047         const UChar UChar_m = 0x6d;
4048         const UChar UChar_x = 0x78;
4049         const UChar UChar_y = 0x79;
4050         if (flagStr.indexOf(UChar_i) != -1) {
4051             flags |= UREGEX_CASE_INSENSITIVE;
4052         }
4053         if (flagStr.indexOf(UChar_m) != -1) {
4054             flags |= UREGEX_MULTILINE;
4055         }
4056         if (flagStr.indexOf(UChar_x) != -1) {
4057             flags |= UREGEX_COMMENTS;
4058         }
4059
4060         //
4061         // Compile the test pattern.
4062         //
4063         status = U_ZERO_ERROR;
4064         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4065         if (status == U_REGEX_UNIMPLEMENTED) {
4066             //
4067             // Test of a feature that is planned for ICU, but not yet implemented.
4068             //   skip the test.
4069             skippedUnimplementedCount++;
4070             delete testPat;
4071             status = U_ZERO_ERROR;
4072             continue;
4073         }
4074
4075         if (U_FAILURE(status)) {
4076             // Some tests are supposed to generate errors.
4077             //   Only report an error for tests that are supposed to succeed.
4078             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4079                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4080             {
4081                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4082             }
4083             status = U_ZERO_ERROR;
4084             delete testPat;
4085             continue;
4086         }
4087
4088         if (fields[2].indexOf(UChar_i) >= 0) {
4089             // ICU should skip this test.
4090             delete testPat;
4091             continue;
4092         }
4093
4094         if (fields[2].indexOf(UChar_c) >= 0) {
4095             // This pattern should have caused a compilation error, but didn't/
4096             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4097             delete testPat;
4098             continue;
4099         }
4100
4101         //
4102         // replace the Perl variables that appear in some of the
4103         //   match data strings.
4104         //
4105         UnicodeString matchString = fields[1];
4106         matchString.findAndReplace(nulnulSrc, nulnul);
4107         matchString.findAndReplace(ffffSrc,   ffff);
4108
4109         // Replace any \n in the match string with an actual new-line char.
4110         //  Don't do full unescape, as this unescapes more than Perl does, which
4111         //  causes other spurious failures in the tests.
4112         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4113
4114
4115
4116         //
4117         // Run the test, check for expected match/don't match result.
4118         //
4119         RegexMatcher *testMat = testPat->matcher(matchString, status);
4120         UBool found = testMat->find();
4121         UBool expected = FALSE;
4122         if (fields[2].indexOf(UChar_y) >=0) {
4123             expected = TRUE;
4124         }
4125         if (expected != found) {
4126             errln("line %d: Expected %smatch, got %smatch",
4127                 lineNum, expected?"":"no ", found?"":"no " );
4128             continue;
4129         }
4130
4131         // Don't try to check expected results if there is no match.
4132         //   (Some have stuff in the expected fields)
4133         if (!found) {
4134             delete testMat;
4135             delete testPat;
4136             continue;
4137         }
4138
4139         //
4140         // Interpret the Perl expression from the fourth field of the data file,
4141         // building up an ICU string from the results of the ICU match.
4142         //   The Perl expression will contain references to the results of
4143         //     a regex match, including the matched string, capture group strings,
4144         //     group starting and ending indicies, etc.
4145         //
4146         UnicodeString resultString;
4147         UnicodeString perlExpr = fields[3];
4148 #if SUPPORT_MUTATING_INPUT_STRING
4149         groupsMat->reset(perlExpr);
4150         cgMat->reset(perlExpr);
4151 #endif
4152
4153         while (perlExpr.length() > 0) {
4154 #if !SUPPORT_MUTATING_INPUT_STRING
4155             //  Perferred usage.  Reset after any modification to input string.
4156             groupsMat->reset(perlExpr);
4157             cgMat->reset(perlExpr);
4158 #endif
4159
4160             if (perlExpr.startsWith("$&")) {
4161                 resultString.append(testMat->group(status));
4162                 perlExpr.remove(0, 2);
4163             }
4164
4165             else if (groupsMat->lookingAt(status)) {
4166                 // $-[0]   $+[2]  etc.
4167                 UnicodeString digitString = groupsMat->group(2, status);
4168                 int32_t t = 0;
4169                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4170                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4171                 int32_t matchPosition;
4172                 if (plusOrMinus.compare("+") == 0) {
4173                     matchPosition = testMat->end(groupNum, status);
4174                 } else {
4175                     matchPosition = testMat->start(groupNum, status);
4176                 }
4177                 if (matchPosition != -1) {
4178                     ICU_Utility::appendNumber(resultString, matchPosition);
4179                 }
4180                 perlExpr.remove(0, groupsMat->end(status));
4181             }
4182
4183             else if (cgMat->lookingAt(status)) {
4184                 // $1, $2, $3, etc.
4185                 UnicodeString digitString = cgMat->group(1, status);
4186                 int32_t t = 0;
4187                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4188                 if (U_SUCCESS(status)) {
4189                     resultString.append(testMat->group(groupNum, status));
4190                     status = U_ZERO_ERROR;
4191                 }
4192                 perlExpr.remove(0, cgMat->end(status));
4193             }
4194
4195             else if (perlExpr.startsWith("@-")) {
4196                 int32_t i;
4197                 for (i=0; i<=testMat->groupCount(); i++) {
4198                     if (i>0) {
4199                         resultString.append(" ");
4200                     }
4201                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4202                 }
4203                 perlExpr.remove(0, 2);
4204             }
4205
4206             else if (perlExpr.startsWith("@+")) {
4207                 int32_t i;
4208                 for (i=0; i<=testMat->groupCount(); i++) {
4209                     if (i>0) {
4210                         resultString.append(" ");
4211                     }
4212                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4213                 }
4214                 perlExpr.remove(0, 2);
4215             }
4216
4217             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4218                                                      //           or as an escaped sequence (e.g. \n)
4219                 if (perlExpr.length() > 1) {
4220                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4221                 }
4222                 UChar c = perlExpr.charAt(0);
4223                 switch (c) {
4224                 case 'n':   c = '\n'; break;
4225                 // add any other escape sequences that show up in the test expected results.
4226                 }
4227                 resultString.append(c);
4228                 perlExpr.remove(0, 1);
4229             }
4230
4231             else  {
4232                 // Any characters from the perl expression that we don't explicitly
4233                 //  recognize before here are assumed to be literals and copied
4234                 //  as-is to the expected results.
4235                 resultString.append(perlExpr.charAt(0));
4236                 perlExpr.remove(0, 1);
4237             }
4238
4239             if (U_FAILURE(status)) {
4240                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4241                 break;
4242             }
4243         }
4244
4245         //
4246         // Expected Results Compare
4247         //
4248         UnicodeString expectedS(fields[4]);
4249         expectedS.findAndReplace(nulnulSrc, nulnul);
4250         expectedS.findAndReplace(ffffSrc,   ffff);
4251         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4252
4253
4254         if (expectedS.compare(resultString) != 0) {
4255             err("Line %d: Incorrect perl expression results.", lineNum);
4256             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4257         }
4258
4259         delete testMat;
4260         delete testPat;
4261     }
4262
4263     //
4264     // All done.  Clean up allocated stuff.
4265     //
4266     delete cgMat;
4267     delete cgPat;
4268
4269     delete groupsMat;
4270     delete groupsPat;
4271
4272     delete flagMat;
4273     delete flagPat;
4274
4275     delete lineMat;
4276     delete linePat;
4277
4278     delete fieldPat;
4279     delete [] testData;
4280
4281
4282     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4283
4284 }
4285
4286
4287 //-------------------------------------------------------------------------------
4288 //
4289 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4290 //                  (instead of using UnicodeStrings) to test the alternate engine.
4291 //                  The input file for this test is re_tests, the standard regular
4292 //                  expression test data distributed with the Perl source code.
4293 //                  See PerlTests() for more information.
4294 //
4295 //-------------------------------------------------------------------------------
4296 void RegexTest::PerlTestsUTF8() {
4297     char tdd[2048];
4298     const char *srcPath;
4299     UErrorCode  status = U_ZERO_ERROR;
4300     UParseError pe;
4301     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4302     UText       patternText = UTEXT_INITIALIZER;
4303     char       *patternChars = NULL;
4304     int32_t     patternLength;
4305     int32_t     patternCapacity = 0;
4306     UText       inputText = UTEXT_INITIALIZER;
4307     char       *inputChars = NULL;
4308     int32_t     inputLength;
4309     int32_t     inputCapacity = 0;
4310
4311     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4312
4313     //
4314     //  Open and read the test data file.
4315     //
4316     srcPath=getPath(tdd, "re_tests.txt");
4317     if(srcPath==NULL) {
4318         return; /* something went wrong, error already output */
4319     }
4320
4321     int32_t    len;
4322     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4323     if (U_FAILURE(status)) {
4324         return; /* something went wrong, error already output */
4325     }
4326
4327     //
4328     //  Put the test data into a UnicodeString
4329     //
4330     UnicodeString testDataString(FALSE, testData, len);
4331
4332     //
4333     //  Regex to break the input file into lines, and strip the new lines.
4334     //     One line per match, capture group one is the desired data.
4335     //
4336     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4337     if (U_FAILURE(status)) {
4338         dataerrln("RegexPattern::compile() error");
4339         return;
4340     }
4341     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4342
4343     //
4344     //  Regex to split a test file line into fields.
4345     //    There are six fields, separated by tabs.
4346     //
4347     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4348
4349     //
4350     //  Regex to identify test patterns with flag settings, and to separate them.
4351     //    Test patterns with flags look like 'pattern'i
4352     //    Test patterns without flags are not quoted:   pattern
4353     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4354     //
4355     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4356     RegexMatcher* flagMat = flagPat->matcher(status);
4357
4358     //
4359     // The Perl tests reference several perl-isms, which are evaluated/substituted
4360     //   in the test data.  Not being perl, this must be done explicitly.  Here
4361     //   are string constants and REs for these constructs.
4362     //
4363     UnicodeString nulnulSrc("${nulnul}");
4364     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4365     nulnul = nulnul.unescape();
4366
4367     UnicodeString ffffSrc("${ffff}");
4368     UnicodeString ffff("\\uffff", -1, US_INV);
4369     ffff = ffff.unescape();
4370
4371     //  regexp for $-[0], $+[2], etc.
4372     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4373     RegexMatcher *groupsMat = groupsPat->matcher(status);
4374
4375     //  regexp for $0, $1, $2, etc.
4376     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4377     RegexMatcher *cgMat = cgPat->matcher(status);
4378
4379
4380     //
4381     // Main Loop for the Perl Tests, runs once per line from the
4382     //   test data file.
4383     //
4384     int32_t  lineNum = 0;
4385     int32_t  skippedUnimplementedCount = 0;
4386     while (lineMat->find()) {
4387         lineNum++;
4388
4389         //
4390         //  Get a line, break it into its fields, do the Perl
4391         //    variable substitutions.
4392         //
4393         UnicodeString line = lineMat->group(1, status);
4394         UnicodeString fields[7];
4395         fieldPat->split(line, fields, 7, status);
4396
4397         flagMat->reset(fields[0]);
4398         flagMat->matches(status);
4399         UnicodeString pattern  = flagMat->group(2, status);
4400         pattern.findAndReplace("${bang}", "!");
4401         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4402         pattern.findAndReplace(ffffSrc, ffff);
4403
4404         //
4405         //  Identify patterns that include match flag settings,
4406         //    split off the flags, remove the extra quotes.
4407         //
4408         UnicodeString flagStr = flagMat->group(3, status);
4409         if (U_FAILURE(status)) {
4410             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4411             return;
4412         }
4413         int32_t flags = 0;
4414         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4415         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4416         const UChar UChar_m = 0x6d;
4417         const UChar UChar_x = 0x78;
4418         const UChar UChar_y = 0x79;
4419         if (flagStr.indexOf(UChar_i) != -1) {
4420             flags |= UREGEX_CASE_INSENSITIVE;
4421         }
4422         if (flagStr.indexOf(UChar_m) != -1) {
4423             flags |= UREGEX_MULTILINE;
4424         }
4425         if (flagStr.indexOf(UChar_x) != -1) {
4426             flags |= UREGEX_COMMENTS;
4427         }
4428
4429         //
4430         // Put the pattern in a UTF-8 UText
4431         //
4432         status = U_ZERO_ERROR;
4433         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4434         if (status == U_BUFFER_OVERFLOW_ERROR) {
4435             status = U_ZERO_ERROR;
4436             delete[] patternChars;
4437             patternCapacity = patternLength + 1;
4438             patternChars = new char[patternCapacity];
4439             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4440         }
4441         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4442
4443         //
4444         // Compile the test pattern.
4445         //
4446         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4447         if (status == U_REGEX_UNIMPLEMENTED) {
4448             //
4449             // Test of a feature that is planned for ICU, but not yet implemented.
4450             //   skip the test.
4451             skippedUnimplementedCount++;
4452             delete testPat;
4453             status = U_ZERO_ERROR;
4454             continue;
4455         }
4456
4457         if (U_FAILURE(status)) {
4458             // Some tests are supposed to generate errors.
4459             //   Only report an error for tests that are supposed to succeed.
4460             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4461                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4462             {
4463                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4464             }
4465             status = U_ZERO_ERROR;
4466             delete testPat;
4467             continue;
4468         }
4469
4470         if (fields[2].indexOf(UChar_i) >= 0) {
4471             // ICU should skip this test.
4472             delete testPat;
4473             continue;
4474         }
4475
4476         if (fields[2].indexOf(UChar_c) >= 0) {
4477             // This pattern should have caused a compilation error, but didn't/
4478             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4479             delete testPat;
4480             continue;
4481         }
4482
4483
4484         //
4485         // replace the Perl variables that appear in some of the
4486         //   match data strings.
4487         //
4488         UnicodeString matchString = fields[1];
4489         matchString.findAndReplace(nulnulSrc, nulnul);
4490         matchString.findAndReplace(ffffSrc,   ffff);
4491
4492         // Replace any \n in the match string with an actual new-line char.
4493         //  Don't do full unescape, as this unescapes more than Perl does, which
4494         //  causes other spurious failures in the tests.
4495         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4496
4497         //
4498         // Put the input in a UTF-8 UText
4499         //
4500         status = U_ZERO_ERROR;
4501         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4502         if (status == U_BUFFER_OVERFLOW_ERROR) {
4503             status = U_ZERO_ERROR;
4504             delete[] inputChars;
4505             inputCapacity = inputLength + 1;
4506             inputChars = new char[inputCapacity];
4507             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4508         }
4509         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4510
4511         //
4512         // Run the test, check for expected match/don't match result.
4513         //
4514         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4515         UBool found = testMat->find();
4516         UBool expected = FALSE;
4517         if (fields[2].indexOf(UChar_y) >=0) {
4518             expected = TRUE;
4519         }
4520         if (expected != found) {
4521             errln("line %d: Expected %smatch, got %smatch",
4522                 lineNum, expected?"":"no ", found?"":"no " );
4523             continue;
4524         }
4525
4526         // Don't try to check expected results if there is no match.
4527         //   (Some have stuff in the expected fields)
4528         if (!found) {
4529             delete testMat;
4530             delete testPat;
4531             continue;
4532         }
4533
4534         //
4535         // Interpret the Perl expression from the fourth field of the data file,
4536         // building up an ICU string from the results of the ICU match.
4537         //   The Perl expression will contain references to the results of
4538         //     a regex match, including the matched string, capture group strings,
4539         //     group starting and ending indicies, etc.
4540         //
4541         UnicodeString resultString;
4542         UnicodeString perlExpr = fields[3];
4543
4544         while (perlExpr.length() > 0) {
4545             groupsMat->reset(perlExpr);
4546             cgMat->reset(perlExpr);
4547
4548             if (perlExpr.startsWith("$&")) {
4549                 resultString.append(testMat->group(status));
4550                 perlExpr.remove(0, 2);
4551             }
4552
4553             else if (groupsMat->lookingAt(status)) {
4554                 // $-[0]   $+[2]  etc.
4555                 UnicodeString digitString = groupsMat->group(2, status);
4556                 int32_t t = 0;
4557                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4558                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4559                 int32_t matchPosition;
4560                 if (plusOrMinus.compare("+") == 0) {
4561                     matchPosition = testMat->end(groupNum, status);
4562                 } else {
4563                     matchPosition = testMat->start(groupNum, status);
4564                 }
4565                 if (matchPosition != -1) {
4566                     ICU_Utility::appendNumber(resultString, matchPosition);
4567                 }
4568                 perlExpr.remove(0, groupsMat->end(status));
4569             }
4570
4571             else if (cgMat->lookingAt(status)) {
4572                 // $1, $2, $3, etc.
4573                 UnicodeString digitString = cgMat->group(1, status);
4574                 int32_t t = 0;
4575                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4576                 if (U_SUCCESS(status)) {
4577                     resultString.append(testMat->group(groupNum, status));
4578                     status = U_ZERO_ERROR;
4579                 }
4580                 perlExpr.remove(0, cgMat->end(status));
4581             }
4582
4583             else if (perlExpr.startsWith("@-")) {
4584                 int32_t i;
4585                 for (i=0; i<=testMat->groupCount(); i++) {
4586                     if (i>0) {
4587                         resultString.append(" ");
4588                     }
4589                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4590                 }
4591                 perlExpr.remove(0, 2);
4592             }
4593
4594             else if (perlExpr.startsWith("@+")) {
4595                 int32_t i;
4596                 for (i=0; i<=testMat->groupCount(); i++) {
4597                     if (i>0) {
4598                         resultString.append(" ");
4599                     }
4600                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4601                 }
4602                 perlExpr.remove(0, 2);
4603             }
4604
4605             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4606                                                      //           or as an escaped sequence (e.g. \n)
4607                 if (perlExpr.length() > 1) {
4608                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4609                 }
4610                 UChar c = perlExpr.charAt(0);
4611                 switch (c) {
4612                 case 'n':   c = '\n'; break;
4613                 // add any other escape sequences that show up in the test expected results.
4614                 }
4615                 resultString.append(c);
4616                 perlExpr.remove(0, 1);
4617             }
4618
4619             else  {
4620                 // Any characters from the perl expression that we don't explicitly
4621                 //  recognize before here are assumed to be literals and copied
4622                 //  as-is to the expected results.
4623                 resultString.append(perlExpr.charAt(0));
4624                 perlExpr.remove(0, 1);
4625             }
4626
4627             if (U_FAILURE(status)) {
4628                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4629                 break;
4630             }
4631         }
4632
4633         //
4634         // Expected Results Compare
4635         //
4636         UnicodeString expectedS(fields[4]);
4637         expectedS.findAndReplace(nulnulSrc, nulnul);
4638         expectedS.findAndReplace(ffffSrc,   ffff);
4639         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4640
4641
4642         if (expectedS.compare(resultString) != 0) {
4643             err("Line %d: Incorrect perl expression results.", lineNum);
4644             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4645         }
4646
4647         delete testMat;
4648         delete testPat;
4649     }
4650
4651     //
4652     // All done.  Clean up allocated stuff.
4653     //
4654     delete cgMat;
4655     delete cgPat;
4656
4657     delete groupsMat;
4658     delete groupsPat;
4659
4660     delete flagMat;
4661     delete flagPat;
4662
4663     delete lineMat;
4664     delete linePat;
4665
4666     delete fieldPat;
4667     delete [] testData;
4668
4669     utext_close(&patternText);
4670     utext_close(&inputText);
4671
4672     delete [] patternChars;
4673     delete [] inputChars;
4674
4675
4676     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4677
4678 }
4679
4680
4681 //--------------------------------------------------------------
4682 //
4683 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4684 //             Use this pattern,
4685 //                 "(a?){1,}"
4686 //             The zero-length match will repeat forever.
4687 //                (That this goes into a loop is another bug)
4688 //
4689 //---------------------------------------------------------------
4690 void RegexTest::Bug6149() {
4691     UnicodeString pattern("(a?){1,}");
4692     UnicodeString s("xyz");
4693     uint32_t flags = 0;
4694     UErrorCode status = U_ZERO_ERROR;
4695
4696     RegexMatcher  matcher(pattern, s, flags, status);
4697     UBool result = false;
4698     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4699     REGEX_ASSERT(result == FALSE);
4700  }
4701
4702
4703 //
4704 //   Callbacks()    Test the callback function.
4705 //                  When set, callbacks occur periodically during matching operations,
4706 //                  giving the application code the ability to abort the operation
//                  before its normal completion.
4708 //
4709
4710 struct callBackContext {
4711     RegexTest        *test;
4712     int32_t          maxCalls;
4713     int32_t          numCalls;
4714     int32_t          lastSteps;
4715     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4716 };
4717
4718 U_CDECL_BEGIN
4719 static UBool U_CALLCONV
4720 testCallBackFn(const void *context, int32_t steps) {
4721     callBackContext  *info = (callBackContext *)context;
4722     if (info->lastSteps+1 != steps) {
4723         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4724     }
4725     info->lastSteps = steps;
4726     info->numCalls++;
4727     return (info->numCalls < info->maxCalls);
4728 }
4729 U_CDECL_END
4730
4731 void RegexTest::Callbacks() {
4732    {
4733         // Getter returns NULLs if no callback has been set
4734
4735         //   The variables that the getter will fill in.
4736         //   Init to non-null values so that the action of the getter can be seen.
4737         const void          *returnedContext = &returnedContext;
4738         URegexMatchCallback *returnedFn = &testCallBackFn;
4739
4740         UErrorCode status = U_ZERO_ERROR;
4741         RegexMatcher matcher("x", 0, status);
4742         REGEX_CHECK_STATUS;
4743         matcher.getMatchCallback(returnedFn, returnedContext, status);
4744         REGEX_CHECK_STATUS;
4745         REGEX_ASSERT(returnedFn == NULL);
4746         REGEX_ASSERT(returnedContext == NULL);
4747     }
4748
4749    {
4750         // Set and Get work
4751         callBackContext cbInfo = {this, 0, 0, 0};
4752         const void          *returnedContext;
4753         URegexMatchCallback *returnedFn;
4754         UErrorCode status = U_ZERO_ERROR;
4755         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4756         REGEX_CHECK_STATUS;
4757         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4758         REGEX_CHECK_STATUS;
4759         matcher.getMatchCallback(returnedFn, returnedContext, status);
4760         REGEX_CHECK_STATUS;
4761         REGEX_ASSERT(returnedFn == testCallBackFn);
4762         REGEX_ASSERT(returnedContext == &cbInfo);
4763
4764         // A short-running match shouldn't invoke the callback
4765         status = U_ZERO_ERROR;
4766         cbInfo.reset(1);
4767         UnicodeString s = "xxx";
4768         matcher.reset(s);
4769         REGEX_ASSERT(matcher.matches(status));
4770         REGEX_CHECK_STATUS;
4771         REGEX_ASSERT(cbInfo.numCalls == 0);
4772
4773         // A medium-length match that runs long enough to invoke the
4774         //   callback, but not so long that the callback aborts it.
4775         status = U_ZERO_ERROR;
4776         cbInfo.reset(4);
4777         s = "aaaaaaaaaaaaaaaaaaab";
4778         matcher.reset(s);
4779         REGEX_ASSERT(matcher.matches(status)==FALSE);
4780         REGEX_CHECK_STATUS;
4781         REGEX_ASSERT(cbInfo.numCalls > 0);
4782
4783         // A longer running match that the callback function will abort.
4784         status = U_ZERO_ERROR;
4785         cbInfo.reset(4);
4786         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4787         matcher.reset(s);
4788         REGEX_ASSERT(matcher.matches(status)==FALSE);
4789         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4790         REGEX_ASSERT(cbInfo.numCalls == 4);
4791     }
4792
4793
4794 }
4795
4796
4797 //
4798 //   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4802 //
4803
4804 struct progressCallBackContext {
4805     RegexTest        *test;
4806     int64_t          lastIndex;
4807     int32_t          maxCalls;
4808     int32_t          numCalls;
4809     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4810 };
4811
4812 U_CDECL_BEGIN
4813 static UBool U_CALLCONV
4814 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4815     progressCallBackContext  *info = (progressCallBackContext *)context;
4816     info->numCalls++;
4817     info->lastIndex = matchIndex;
4818 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4819     return (info->numCalls < info->maxCalls);
4820 }
4821 U_CDECL_END
4822
4823 void RegexTest::FindProgressCallbacks() {
4824    {
4825         // Getter returns NULLs if no callback has been set
4826
4827         //   The variables that the getter will fill in.
4828         //   Init to non-null values so that the action of the getter can be seen.
4829         const void                  *returnedContext = &returnedContext;
4830         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4831
4832         UErrorCode status = U_ZERO_ERROR;
4833         RegexMatcher matcher("x", 0, status);
4834         REGEX_CHECK_STATUS;
4835         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4836         REGEX_CHECK_STATUS;
4837         REGEX_ASSERT(returnedFn == NULL);
4838         REGEX_ASSERT(returnedContext == NULL);
4839     }
4840
4841    {
4842         // Set and Get work
4843         progressCallBackContext cbInfo = {this, 0, 0, 0};
4844         const void                  *returnedContext;
4845         URegexFindProgressCallback  *returnedFn;
4846         UErrorCode status = U_ZERO_ERROR;
4847         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4848         REGEX_CHECK_STATUS;
4849         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4850         REGEX_CHECK_STATUS;
4851         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4852         REGEX_CHECK_STATUS;
4853         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4854         REGEX_ASSERT(returnedContext == &cbInfo);
4855
4856         // A short-running match should NOT invoke the callback.
4857         status = U_ZERO_ERROR;
4858         cbInfo.reset(100);
4859         UnicodeString s = "abxxx";
4860         matcher.reset(s);
4861 #if 0
4862         matcher.setTrace(TRUE);
4863 #endif
4864         REGEX_ASSERT(matcher.find(0, status));
4865         REGEX_CHECK_STATUS;
4866         REGEX_ASSERT(cbInfo.numCalls == 0);
4867
4868         // A medium running match that causes matcher.find() to invoke our callback for each index.
4869         status = U_ZERO_ERROR;
4870         s = "aaaaaaaaaaaaaaaaaaab";
4871         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4872         matcher.reset(s);
4873         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4874         REGEX_CHECK_STATUS;
4875         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4876
4877         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4878         status = U_ZERO_ERROR;
4879         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4880         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4881         matcher.reset(s1);
4882         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4883         REGEX_CHECK_STATUS;
4884         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4885
4886 #if 0
4887         // Now a match that will succeed, but after an interruption
4888         status = U_ZERO_ERROR;
4889         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4890         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4891         matcher.reset(s2);
4892         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4893         REGEX_CHECK_STATUS;
4894         // Now retry the match from where left off
4895         cbInfo.maxCalls = 100; //  No callback limit
4896         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4897         REGEX_CHECK_STATUS;
4898 #endif
4899     }
4900
4901
4902 }
4903
4904
4905 //---------------------------------------------------------------------------
4906 //
4907 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4908 //                             UTexts. The pure-C implementation of UText
4909 //                             has no mutable backing stores, but we can
4910 //                             use UnicodeString here to test the functionality.
4911 //
4912 //---------------------------------------------------------------------------
4913 void RegexTest::PreAllocatedUTextCAPI () {
4914     UErrorCode           status = U_ZERO_ERROR;
4915     URegularExpression  *re;
4916     UText                patternText = UTEXT_INITIALIZER;
4917     UnicodeString        buffer;
4918     UText                bufferText = UTEXT_INITIALIZER;
4919
4920     utext_openUnicodeString(&bufferText, &buffer, &status);
4921
4922     /*
4923      *  getText() and getUText()
4924      */
4925     {
4926         UText  text1 = UTEXT_INITIALIZER;
4927         UText  text2 = UTEXT_INITIALIZER;
4928         UChar  text2Chars[20];
4929         UText  *resultText;
4930
4931         status = U_ZERO_ERROR;
4932         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4933         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4934         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4935         utext_openUChars(&text2, text2Chars, -1, &status);
4936
4937         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4938         re = uregex_openUText(&patternText, 0, NULL, &status);
4939
4940         /* First set a UText */
4941         uregex_setUText(re, &text1, &status);
4942         resultText = uregex_getUText(re, &bufferText, &status);
4943         REGEX_CHECK_STATUS;
4944         REGEX_ASSERT(resultText == &bufferText);
4945         utext_setNativeIndex(resultText, 0);
4946         utext_setNativeIndex(&text1, 0);
4947         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4948
4949         resultText = uregex_getUText(re, &bufferText, &status);
4950         REGEX_CHECK_STATUS;
4951         REGEX_ASSERT(resultText == &bufferText);
4952         utext_setNativeIndex(resultText, 0);
4953         utext_setNativeIndex(&text1, 0);
4954         REGEX_ASSERT(testUTextEqual(resultText, &text1));
4955
4956         /* Then set a UChar * */
4957         uregex_setText(re, text2Chars, 7, &status);
4958         resultText = uregex_getUText(re, &bufferText, &status);
4959         REGEX_CHECK_STATUS;
4960         REGEX_ASSERT(resultText == &bufferText);
4961         utext_setNativeIndex(resultText, 0);
4962         utext_setNativeIndex(&text2, 0);
4963         REGEX_ASSERT(testUTextEqual(resultText, &text2));
4964
4965         uregex_close(re);
4966         utext_close(&text1);
4967         utext_close(&text2);
4968     }
4969
4970     /*
4971      *  group()
4972      */
4973     {
4974         UChar    text1[80];
4975         UText   *actual;
4976         UBool    result;
4977         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  sizeof(text1)/2);
4978
4979         status = U_ZERO_ERROR;
4980         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
4981         REGEX_CHECK_STATUS;
4982
4983         uregex_setText(re, text1, -1, &status);
4984         result = uregex_find(re, 0, &status);
4985         REGEX_ASSERT(result==TRUE);
4986
4987         /*  Capture Group 0, the full match.  Should succeed.  */
4988         status = U_ZERO_ERROR;
4989         actual = uregex_groupUTextDeep(re, 0, &bufferText, &status);
4990         REGEX_CHECK_STATUS;
4991         REGEX_ASSERT(actual == &bufferText);
4992         REGEX_ASSERT_UTEXT_INVARIANT("abc interior def", actual);
4993
4994         /*  Capture group #1.  Should succeed. */
4995         status = U_ZERO_ERROR;
4996         actual = uregex_groupUTextDeep(re, 1, &bufferText, &status);
4997         REGEX_CHECK_STATUS;
4998         REGEX_ASSERT(actual == &bufferText);
4999         REGEX_ASSERT_UTEXT_INVARIANT(" interior ", actual);
5000
5001         /*  Capture group out of range.  Error. */
5002         status = U_ZERO_ERROR;
5003         actual = uregex_groupUTextDeep(re, 2, &bufferText, &status);
5004         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5005         REGEX_ASSERT(actual == &bufferText);
5006
5007         uregex_close(re);
5008
5009     }
5010
5011     /*
5012      *  replaceFirst()
5013      */
5014     {
5015         UChar    text1[80];
5016         UChar    text2[80];
5017         UText    replText = UTEXT_INITIALIZER;
5018         UText   *result;
5019
5020         status = U_ZERO_ERROR;
5021         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5022         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5023         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5024
5025         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5026         REGEX_CHECK_STATUS;
5027
5028         /*  Normal case, with match */
5029         uregex_setText(re, text1, -1, &status);
5030         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5031         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5032         REGEX_CHECK_STATUS;
5033         REGEX_ASSERT(result == &bufferText);
5034         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5035
5036         /* No match.  Text should copy to output with no changes.  */
5037         uregex_setText(re, text2, -1, &status);
5038         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5039         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5040         REGEX_CHECK_STATUS;
5041         REGEX_ASSERT(result == &bufferText);
5042         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5043
5044         /* Unicode escapes */
5045         uregex_setText(re, text1, -1, &status);
5046         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
5047         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5048         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5049         REGEX_CHECK_STATUS;
5050         REGEX_ASSERT(result == &bufferText);
5051         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5052
5053         uregex_close(re);
5054         utext_close(&replText);
5055     }
5056
5057
5058     /*
5059      *  replaceAll()
5060      */
5061     {
5062         UChar    text1[80];
5063         UChar    text2[80];
5064         UText    replText = UTEXT_INITIALIZER;
5065         UText   *result;
5066
5067         status = U_ZERO_ERROR;
5068         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5069         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5070         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5071
5072         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5073         REGEX_CHECK_STATUS;
5074
5075         /*  Normal case, with match */
5076         uregex_setText(re, text1, -1, &status);
5077         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5078         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5079         REGEX_CHECK_STATUS;
5080         REGEX_ASSERT(result == &bufferText);
5081         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5082
5083         /* No match.  Text should copy to output with no changes.  */
5084         uregex_setText(re, text2, -1, &status);
5085         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5086         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5087         REGEX_CHECK_STATUS;
5088         REGEX_ASSERT(result == &bufferText);
5089         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5090
5091         uregex_close(re);
5092         utext_close(&replText);
5093     }
5094
5095
5096     /*
5097      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5098      *   so we don't need to test it here.
5099      */
5100
5101     utext_close(&bufferText);
5102     utext_close(&patternText);
5103 }
5104
5105 //--------------------------------------------------------------
5106 //
5107 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5108 //
5109 //---------------------------------------------------------------
5110 void RegexTest::Bug7651() {
5111     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5112     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5113     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5114     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5115     UnicodeString s("#ff @abcd This is test");
5116     RegexPattern  *REPattern = NULL;
5117     RegexMatcher  *REMatcher = NULL;
5118     UErrorCode status = U_ZERO_ERROR;
5119     UParseError pe;
5120
5121     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5122     REGEX_CHECK_STATUS;
5123     REMatcher = REPattern->matcher(s, status);
5124     REGEX_CHECK_STATUS;
5125     REGEX_ASSERT(REMatcher->find());
5126     REGEX_ASSERT(REMatcher->start(status) == 0);
5127     delete REPattern;
5128     delete REMatcher;
5129     status = U_ZERO_ERROR;
5130
5131     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5132     REGEX_CHECK_STATUS;
5133     REMatcher = REPattern->matcher(s, status);
5134     REGEX_CHECK_STATUS;
5135     REGEX_ASSERT(REMatcher->find());
5136     REGEX_ASSERT(REMatcher->start(status) == 0);
5137     delete REPattern;
5138     delete REMatcher;
5139     status = U_ZERO_ERROR;
5140  }
5141
5142 void RegexTest::Bug7740() {
5143     UErrorCode status = U_ZERO_ERROR;
5144     UnicodeString pattern = "(a)";
5145     UnicodeString text = "abcdef";
5146     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5147     REGEX_CHECK_STATUS;
5148     REGEX_ASSERT(m->lookingAt(status));
5149     REGEX_CHECK_STATUS;
5150     status = U_ILLEGAL_ARGUMENT_ERROR;
5151     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5152     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5153     REGEX_ASSERT(s == "");
5154     delete m;
5155 }
5156
5157 // Bug 8479:  was crashing whith a Bogus UnicodeString as input.
5158
5159 void RegexTest::Bug8479() {
5160     UErrorCode status = U_ZERO_ERROR;
5161
5162     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5163     REGEX_CHECK_STATUS;
5164     if (U_SUCCESS(status))
5165     {
5166         UnicodeString str;
5167         str.setToBogus();
5168         pMatcher->reset(str);
5169         status = U_ZERO_ERROR;
5170         pMatcher->matches(status);
5171         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5172         delete pMatcher;
5173     }
5174 }
5175
5176
5177 // Bug 7029
5178 void RegexTest::Bug7029() {
5179     UErrorCode status = U_ZERO_ERROR;
5180
5181     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5182     UnicodeString text = "abc.def";
5183     UnicodeString splits[10];
5184     REGEX_CHECK_STATUS;
5185     int32_t numFields = pMatcher->split(text, splits, 10, status);
5186     REGEX_CHECK_STATUS;
5187     REGEX_ASSERT(numFields == 8);
5188     delete pMatcher;
5189 }
5190
5191 // Bug 9283
5192 //   This test is checking for the existance of any supplemental characters that case-fold
5193 //   to a bmp character.
5194 //
5195 //   At the time of this writing there are none. If any should appear in a subsequent release
5196 //   of Unicode, the code in regular expressions compilation that determines the longest
5197 //   posssible match for a literal string  will need to be enhanced.
5198 //
5199 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5200 //   for details on what to do in case of a failure of this test.
5201 //
5202 void RegexTest::Bug9283() {
5203     UErrorCode status = U_ZERO_ERROR;
5204     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5205     REGEX_CHECK_STATUS;
5206     int32_t index;
5207     UChar32 c;
5208     for (index=0; ; index++) {
5209         c = supplementalsWithCaseFolding.charAt(index);
5210         if (c == -1) {
5211             break;
5212         }
5213         UnicodeString cf = UnicodeString(c).foldCase();
5214         REGEX_ASSERT(cf.length() >= 2);
5215     }
5216 }
5217
5218
5219 void RegexTest::CheckInvBufSize() {
5220   if(inv_next>=INV_BUFSIZ) {
5221     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5222           __FILE__, INV_BUFSIZ, inv_next);
5223   } else {
5224     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5225   }
5226 }
5227
5228 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5229