icuSources/test/intltest/regextst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 2002-2008, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6
   7 //
   8 //   regextst.cpp
   9 //
  10 //      ICU Regular Expressions test, part of intltest.
  11 //
  12
  13 #include "intltest.h"
  14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  15
  16 #include "unicode/regex.h"
  17 #include "unicode/uchar.h"
  18 #include "unicode/ucnv.h"
  19 #include "regextst.h"
  20 #include "uvector.h"
  21 #include "util.h"
  22 #include <stdlib.h>
  23 #include <string.h>
  24 #include <stdio.h>
  25
  26
  27 //---------------------------------------------------------------------------
  28 //
  29 //  Test class boilerplate
  30 //
  31 //---------------------------------------------------------------------------
  32 RegexTest::RegexTest() {
  33     // Intentionally empty: the constructor body performs no work of its own.
  34 }
  35
  36
  37 RegexTest::~RegexTest() {
  38     // Intentionally empty: the destructor body releases nothing itself.
  39 }
  40
  41
  42
  43 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
  44 {
  45     // Sub-tests in index order.  For a valid index, name is set to the sub-test's
  46     // name and, when exec is true, the sub-test is run.  Any other index yields an
  47     // empty name, which is needed to end the caller's loop.
  48     typedef void (RegexTest::*TestFunction)();
  49     static const struct {
  50         const char   *fName;
  51         TestFunction  fFunction;
  52     } kSubTests[] = {
  53         { "Basic",       &RegexTest::Basic       },
  54         { "API_Match",   &RegexTest::API_Match   },
  55         { "API_Replace", &RegexTest::API_Replace },
  56         { "API_Pattern", &RegexTest::API_Pattern },
  57         { "Extended",    &RegexTest::Extended    },
  58         { "Errors",      &RegexTest::Errors      },
  59         { "PerlTests",   &RegexTest::PerlTests   },
  60         { "Callbacks",   &RegexTest::Callbacks   }
  61     };
  62     const int32_t kSubTestCount = (int32_t)(sizeof(kSubTests) / sizeof(kSubTests[0]));
  63
  64     if (exec) {
  65         logln("TestSuite RegexTest: ");
  66     }
  67
  68     if (index < 0 || index >= kSubTestCount) {
  69         name = "";
  70         return;
  71     }
  72     name = kSubTests[index].fName;
  73     if (exec) {
  74         (this->*kSubTests[index].fFunction)();
  75     }
  76 }
  77
  78
  79 //---------------------------------------------------------------------------
  80 //   Error Checking / Reporting macros used in all of the tests.
  81 //   Each one reports through errln() and tags the message with __LINE__.
  82 //   REGEX_CHECK_STATUS: if the `status` variable in scope is a failure, report it and return.
  83 //---------------------------------------------------------------------------
  84 #define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {errln("RegexTest failure at line %d.  status=%s\n", \
  85 __LINE__, u_errorName(status)); return;}}
  86 // REGEX_ASSERT: report a failure when expr compares equal to FALSE; execution continues.
  87 #define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("RegexTest failure at line %d.\n", __LINE__);};}
  88 // REGEX_ASSERT_FAIL: evaluate expr against a fresh local `status` (shadowing any outer one); report unless it equals errcode.
  89 #define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
  90 if (status!=errcode) {errln("RegexTest failure at line %d.  Expected status=%s, got %s\n", \
  91     __LINE__, u_errorName(errcode), u_errorName(status));};}
  92 // REGEX_CHECK_STATUS_L: as REGEX_CHECK_STATUS, but also prints the caller-supplied line and does not return.
  93 #define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
  94     "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}
  95 // REGEX_ASSERT_L: as REGEX_ASSERT, but also prints the caller-supplied line and returns on failure.
  96 #define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
  97     errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}
  98
  99
 100
 101 //---------------------------------------------------------------------------
 102 //
 103 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
 104 //                       for the lookingAt() and matches() functions.
 105 //
 106 //       usage:
 107 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
 108 //
 109 //          The expected results are UBool - TRUE or FALSE.
 110 //          The input text is unescaped.  The pattern is not.
 111 //
 112 //
 113 //---------------------------------------------------------------------------
 114
 115 #define REGEX_TESTLM(pat, text, looking, match) doRegexLMTest(pat, text, looking, match, __LINE__);
 116
 117 // Compiles pat, matches it against text (only text is unescaped first), and compares the
 118 // results of lookingAt() and matches() with the expected values looking and match.
 119 // line is the call site's line number, as passed by REGEX_TESTLM; it appears in messages.
 120 // Returns TRUE if everything agreed, FALSE (after reporting with errln) otherwise.
 121 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
 122     UErrorCode          status     = U_ZERO_ERROR;
 123     UParseError         pe;
 124     UBool               retVal     = TRUE;
 125
 126     const UnicodeString patString(pat, -1, US_INV);
 127     RegexPattern       *REPattern  = RegexPattern::compile(patString, 0, pe, status);
 128     if (U_FAILURE(status)) {
 129         errln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s\n",
 130             line, u_errorName(status));
 131         return FALSE;
 132     }
 133     if (line==376) { RegexPatternDump(REPattern);}   // Debugging hook keyed to one call site's line.
 134
 135     const UnicodeString unEscapedInput = UnicodeString(text, -1, US_INV).unescape();
 136     RegexMatcher       *REMatcher  = REPattern->matcher(unEscapedInput, status);
 137     if (U_FAILURE(status)) {
 138         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
 139             line, u_errorName(status));
 140         delete REMatcher;      // Harmless if no matcher was returned.
 141         delete REPattern;      // Do not leak the compiled pattern on this early exit.
 142         return FALSE;
 143     }
 144
 145     UBool actualmatch = REMatcher->lookingAt(status);
 146     if (U_FAILURE(status)) {
 147         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
 148             line, u_errorName(status));
 149         retVal =  FALSE;
 150     }
 151     if (actualmatch != looking) {
 152         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
 153         retVal = FALSE;
 154     }
 155
 156     status = U_ZERO_ERROR;     // A lookingAt() failure must not leak into the matches() check.
 157     actualmatch = REMatcher->matches(status);
 158     if (U_FAILURE(status)) {
 159         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
 160             line, u_errorName(status));
 161         retVal = FALSE;
 162     }
 163     if (actualmatch != match) {
 164         errln("RegexTest: wrong return from matches() at line %d.\n", line);
 165         retVal = FALSE;
 166     }
 167
 168     if (retVal == FALSE) {
 169         RegexPatternDump(REPattern);   // Dump the compiled pattern to help diagnose the failure.
 170     }
 171
 172     delete REMatcher;
 173     delete REPattern;
 174     return retVal;
 175 }
 176
 177
 178
 179
 180
 181 //---------------------------------------------------------------------------
 182 //
 183 //    REGEX_ERR       Macro + invocation function to simplify writing
 184 //                       regex tests for incorrect patterns
 185 //
 186 //       usage:
 187 //          REGEX_ERR("pattern",   expected error line, column, expected status);
 188 //
 189 //---------------------------------------------------------------------------
 190 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
 191
 192 // Compiles pat and checks that compilation ends with expectedStatus.  When that status
 193 // is not U_ZERO_ERROR, the line and offset reported in UParseError must also equal
 194 // errLine and errCol.  line is the call site's line (from REGEX_ERR), used in messages.
 195 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
 196                           UErrorCode expectedStatus, int32_t line) {
 197     UErrorCode          status         = U_ZERO_ERROR;
 198     UParseError         pe;
 199
 200     //
 201     //  Compile the caller's pattern
 202     //
 203     UnicodeString       patString(pat);
 204     RegexPattern       *callerPattern  = RegexPattern::compile(patString, 0, pe, status);
 205
 206     if (status != expectedStatus) {
 207         errln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
 208     } else if (status != U_ZERO_ERROR) {
 209         // The expected error occurred; its reported position has to match as well.
 210         if (pe.line != errLine || pe.offset != errCol) {
 211             errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
 212                 line, errLine, errCol, pe.line, pe.offset);
 213         }
 214     }
 215
 216     delete callerPattern;
 217 }
 218
 219
 220
 221 //---------------------------------------------------------------------------
 222 //
 223 //      Basic      Check for basic functionality of regex pattern matching.
 224 //                 Avoid the use of REGEX_FIND test macro, which has
 225 //                 substantial dependencies on basic Regex functionality.
 226 //
 227 //---------------------------------------------------------------------------
 228 void RegexTest::Basic() {
 229     // Every REGEX_TESTLM(pattern, input, looking, match) line below calls doRegexLMTest()
 230     // with the expected lookingAt() and matches() results, plus its own __LINE__.
 231 //
 232 // Debug - move failing test cases here so that they run first (normally compiled out)
 233 //
 234 #if 0
 235     {
 236         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
 237         UParseError pe;
 238         UErrorCode  status = U_ZERO_ERROR;
 239         RegexPattern::compile("^(?:a?b?)*$", 0, pe, status);
 240         // REGEX_FIND("(?>(abc{2,4}?))(c*)", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
 241         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
 242     }
 243     exit(1);
 244 #endif
 245
 246
 247     //
 248     // Pattern with parentheses
 249     //
 250     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
 251     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
 252     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
 253
 254     //
 255     // Patterns with *
 256     //
 257     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
 258     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
 259     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
 260     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
 261     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
 262
 263     REGEX_TESTLM("a*", "",  TRUE, TRUE);
 264     REGEX_TESTLM("a*", "b", TRUE, FALSE);
 265
 266
 267     //
 268     //  Patterns with "."
 269     //
 270     REGEX_TESTLM(".", "abc", TRUE, FALSE);
 271     REGEX_TESTLM("...", "abc", TRUE, TRUE);
 272     REGEX_TESTLM("....", "abc", FALSE, FALSE);
 273     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
 274     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
 275     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
 276     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
 277     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
 278
 279     //
 280     //  Patterns with * applied to chars at end of literal string
 281     //
 282     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
 283     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
 284
 285     //
 286     //  Supplemental chars match as single chars, not a pair of surrogates.
 287     //
 288     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
 289     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
 290     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
 291
 292
 293     //
 294     //  UnicodeSets in the pattern
 295     //
 296     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
 297     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
 298     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
 299     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 300     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
 301     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
 302
 303     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
 304     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
 305     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
 306     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
 307     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
 308
 309     //
 310     //   OR operator in patterns
 311     //
 312     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
 313     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
 314     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
 315     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
 316
 317     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
 318     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
 319     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
 320     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
 321     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
 322     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
 323
 324     //
 325     //  +
 326     //
 327     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
 328     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
 329     REGEX_TESTLM("b+", "", FALSE, FALSE);
 330     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
 331     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
 332     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
 333
 334     //
 335     //   ?
 336     //
 337     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
 338     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
 339     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
 340     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
 341     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
 342     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
 343     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
 344     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
 345     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
 346
 347     //
 348     //  Escape sequences that become single literal chars, handled internally
 349     //   by ICU's Unescape.
 350     //
 351
 352     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
 353     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
 354     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
 355     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
 356     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
 357     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
 358     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
 359     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
 360     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
 361     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
 362
 363     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
 364     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
 365
 366     // Escape of special chars in patterns
 367     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
 368
 369
 370 }
 371
 372
 373 //---------------------------------------------------------------------------
 374 //
 375 //      API_Match   Test that the API for class RegexMatcher
 376 //                  is present and nominally working, but excluding functions
 377 //                  implementing replace operations.
 378 //
 379 //---------------------------------------------------------------------------
 380 void RegexTest::API_Match() {
 381     UParseError         pe;
 382     UErrorCode          status=U_ZERO_ERROR;
 383     int32_t             flags = 0;
 384
 385     //
 386     // Debug - slide failing test cases early
 387     //
 388 #if 0
 389     {
 390     }
 391     return;
 392 #endif
 393
 394     //
 395     // Simple pattern compilation
 396     //
 397     {
 398         UnicodeString       re("abc");
 399         RegexPattern        *pat2;
 400         pat2 = RegexPattern::compile(re, flags, pe, status);
 401         REGEX_CHECK_STATUS;
 402
 403         UnicodeString inStr1 = "abcdef this is a test";
 404         UnicodeString instr2 = "not abc";
 405         UnicodeString empty  = "";
 406
 407
 408         //
 409         // Matcher creation and reset.
 410         //
 411         RegexMatcher *m1 = pat2->matcher(inStr1, status);
 412         REGEX_CHECK_STATUS;
 413         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 414         REGEX_ASSERT(m1->input() == inStr1);
 415         m1->reset(instr2);
 416         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 417         REGEX_ASSERT(m1->input() == instr2);
 418         m1->reset(inStr1);
 419         REGEX_ASSERT(m1->input() == inStr1);
 420         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 421         m1->reset(empty);
 422         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
 423         REGEX_ASSERT(m1->input() == empty);
 424         REGEX_ASSERT(&m1->pattern() == pat2);
 425
 426         //
 427         //  reset(pos, status)
 428         //
 429         m1->reset(inStr1);
 430         m1->reset(4, status);
 431         REGEX_CHECK_STATUS;
 432         REGEX_ASSERT(m1->input() == inStr1);
 433         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
 434
 435         m1->reset(-1, status);
 436         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 437         status = U_ZERO_ERROR;
 438
 439         m1->reset(0, status);
 440         REGEX_CHECK_STATUS;
 441         status = U_ZERO_ERROR;
 442
 443         int32_t len = m1->input().length();
 444         m1->reset(len-1, status);
 445         REGEX_CHECK_STATUS;
 446         status = U_ZERO_ERROR;
 447
 448         m1->reset(len, status);
 449         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 450         status = U_ZERO_ERROR;
 451
 452         //
 453         // match(pos, status)
 454         //
 455         m1->reset(instr2);
 456         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 457         m1->reset();
 458         REGEX_ASSERT(m1->matches(3, status) == FALSE);
 459         m1->reset();
 460         REGEX_ASSERT(m1->matches(5, status) == FALSE);
 461         REGEX_ASSERT(m1->matches(4, status) == TRUE);
 462         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
 463         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 464
 465         // Match() at end of string should fail, but should not
 466         //  be an error.
 467         status = U_ZERO_ERROR;
 468         len = m1->input().length();
 469         REGEX_ASSERT(m1->matches(len, status) == FALSE);
 470         REGEX_CHECK_STATUS;
 471
 472         // Match beyond end of string should fail with an error.
 473         status = U_ZERO_ERROR;
 474         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
 475         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 476
 477         // Successful match at end of string.
 478         {
 479             status = U_ZERO_ERROR;
 480             RegexMatcher m("A?", 0, status);  // will match zero length string.
 481             REGEX_CHECK_STATUS;
 482             m.reset(inStr1);
 483             len = inStr1.length();
 484             REGEX_ASSERT(m.matches(len, status) == TRUE);
 485             REGEX_CHECK_STATUS;
 486             m.reset(empty);
 487             REGEX_ASSERT(m.matches(0, status) == TRUE);
 488             REGEX_CHECK_STATUS;
 489         }
 490
 491
 492         //
 493         // lookingAt(pos, status)
 494         //
 495         status = U_ZERO_ERROR;
 496         m1->reset(instr2);  // "not abc"
 497         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 498         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
 499         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
 500         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
 501         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
 502         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 503         status = U_ZERO_ERROR;
 504         len = m1->input().length();
 505         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
 506         REGEX_CHECK_STATUS;
 507         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
 508         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
 509
 510         delete m1;
 511         delete pat2;
 512     }
 513
 514
 515     //
 516     // Capture Group.
 517     //     RegexMatcher::start();
 518     //     RegexMatcher::end();
 519     //     RegexMatcher::groupCount();
 520     //
 521     {
 522         int32_t             flags=0;
 523         UParseError         pe;
 524         UErrorCode          status=U_ZERO_ERROR;
 525
 526         UnicodeString       re("01(23(45)67)(.*)");
 527         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 528         REGEX_CHECK_STATUS;
 529         UnicodeString data = "0123456789";
 530
 531         RegexMatcher *matcher = pat->matcher(data, status);
 532         REGEX_CHECK_STATUS;
 533         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
 534         static const int32_t matchStarts[] = {0,  2, 4, 8};
 535         static const int32_t matchEnds[]   = {10, 8, 6, 10};
 536         int32_t i;
 537         for (i=0; i<4; i++) {
 538             int32_t actualStart = matcher->start(i, status);
 539             REGEX_CHECK_STATUS;
 540             if (actualStart != matchStarts[i]) {
 541                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
 542                     __LINE__, i, matchStarts[i], actualStart);
 543             }
 544             int32_t actualEnd = matcher->end(i, status);
 545             REGEX_CHECK_STATUS;
 546             if (actualEnd != matchEnds[i]) {
 547                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
 548                     __LINE__, i, matchEnds[i], actualEnd);
 549             }
 550         }
 551
 552         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
 553         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
 554
 555         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 556         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 557         matcher->reset();
 558         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
 559
 560         matcher->lookingAt(status);
 561         REGEX_ASSERT(matcher->group(status)    == "0123456789");
 562         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
 563         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
 564         REGEX_ASSERT(matcher->group(2, status) == "45"        );
 565         REGEX_ASSERT(matcher->group(3, status) == "89"        );
 566         REGEX_CHECK_STATUS;
 567         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 568         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
 569         matcher->reset();
 570         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
 571
 572         delete matcher;
 573         delete pat;
 574
 575     }
 576
 577     //
 578     //  find
 579     //
 580     {
 581         int32_t             flags=0;
 582         UParseError         pe;
 583         UErrorCode          status=U_ZERO_ERROR;
 584
 585         UnicodeString       re("abc");
 586         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 587         REGEX_CHECK_STATUS;
 588         UnicodeString data = ".abc..abc...abc..";
 589         //                    012345678901234567
 590
 591         RegexMatcher *matcher = pat->matcher(data, status);
 592         REGEX_CHECK_STATUS;
 593         REGEX_ASSERT(matcher->find());
 594         REGEX_ASSERT(matcher->start(status) == 1);
 595         REGEX_ASSERT(matcher->find());
 596         REGEX_ASSERT(matcher->start(status) == 6);
 597         REGEX_ASSERT(matcher->find());
 598         REGEX_ASSERT(matcher->start(status) == 12);
 599         REGEX_ASSERT(matcher->find() == FALSE);
 600         REGEX_ASSERT(matcher->find() == FALSE);
 601
 602         matcher->reset();
 603         REGEX_ASSERT(matcher->find());
 604         REGEX_ASSERT(matcher->start(status) == 1);
 605
 606         REGEX_ASSERT(matcher->find(0, status));
 607         REGEX_ASSERT(matcher->start(status) == 1);
 608         REGEX_ASSERT(matcher->find(1, status));
 609         REGEX_ASSERT(matcher->start(status) == 1);
 610         REGEX_ASSERT(matcher->find(2, status));
 611         REGEX_ASSERT(matcher->start(status) == 6);
 612         REGEX_ASSERT(matcher->find(12, status));
 613         REGEX_ASSERT(matcher->start(status) == 12);
 614         REGEX_ASSERT(matcher->find(13, status) == FALSE);
 615         REGEX_ASSERT(matcher->find(16, status) == FALSE);
 616         REGEX_ASSERT(matcher->find(17, status) == FALSE);
 617         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
 618
 619         status = U_ZERO_ERROR;
 620         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
 621         status = U_ZERO_ERROR;
 622         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
 623
 624         REGEX_ASSERT(matcher->groupCount() == 0);
 625
 626         delete matcher;
 627         delete pat;
 628     }
 629
 630
 631     //
 632     //  find, with \G in pattern (true if at the end of a previous match).
 633     //
 634     {
 635         int32_t             flags=0;
 636         UParseError         pe;
 637         UErrorCode          status=U_ZERO_ERROR;
 638
 639         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
 640         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 641         REGEX_CHECK_STATUS;
 642         UnicodeString data = ".abcabc.abc..";
 643         //                    012345678901234567
 644
 645         RegexMatcher *matcher = pat->matcher(data, status);
 646         REGEX_CHECK_STATUS;
 647         REGEX_ASSERT(matcher->find());
 648         REGEX_ASSERT(matcher->start(status) == 0);
 649         REGEX_ASSERT(matcher->start(1, status) == -1);
 650         REGEX_ASSERT(matcher->start(2, status) == 1);
 651
 652         REGEX_ASSERT(matcher->find());
 653         REGEX_ASSERT(matcher->start(status) == 4);
 654         REGEX_ASSERT(matcher->start(1, status) == 4);
 655         REGEX_ASSERT(matcher->start(2, status) == -1);
 656         REGEX_CHECK_STATUS;
 657
 658         delete matcher;
 659         delete pat;
 660     }
 661
 662     //
 663     //   find with zero length matches, match position should bump ahead
 664     //     to prevent loops.
 665     //
 666     {
 667         int32_t                 i;
 668         UErrorCode          status=U_ZERO_ERROR;
 669         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
 670                                                       //   using an always-true look-ahead.
 671         REGEX_CHECK_STATUS;
 672         UnicodeString s("    ");
 673         m.reset(s);
 674         for (i=0; ; i++) {
 675             if (m.find() == FALSE) {
 676                 break;
 677             }
 678             REGEX_ASSERT(m.start(status) == i);
 679             REGEX_ASSERT(m.end(status) == i);
 680         }
 681         REGEX_ASSERT(i==5);
 682
 683         // Check that the bump goes over surrogate pairs OK
 684         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
 685         s = s.unescape();
 686         m.reset(s);
 687         for (i=0; ; i+=2) {
 688             if (m.find() == FALSE) {
 689                 break;
 690             }
 691             REGEX_ASSERT(m.start(status) == i);
 692             REGEX_ASSERT(m.end(status) == i);
 693         }
 694         REGEX_ASSERT(i==10);
 695     }
 696     {
 697         // find() loop breaking test.
 698         //        with pattern of /.?/, should see a series of one char matches, then a single
 699         //        match of zero length at the end of the input string.
 700         int32_t                 i;
 701         UErrorCode          status=U_ZERO_ERROR;
 702         RegexMatcher        m(".?", 0, status);
 703         REGEX_CHECK_STATUS;
 704         UnicodeString s("    ");
 705         m.reset(s);
 706         for (i=0; ; i++) {
 707             if (m.find() == FALSE) {
 708                 break;
 709             }
 710             REGEX_ASSERT(m.start(status) == i);
 711             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
 712         }
 713         REGEX_ASSERT(i==5);
 714     }
 715
 716
 717     //
 718     // Matchers with no input string behave as if they had an empty input string.
 719     //
 720
 721     {
 722         UErrorCode status = U_ZERO_ERROR;
 723         RegexMatcher  m(".?", 0, status);
 724         REGEX_CHECK_STATUS;
 725         REGEX_ASSERT(m.find());
 726         REGEX_ASSERT(m.start(status) == 0);
 727         REGEX_ASSERT(m.input() == "");
 728     }
 729     {
 730         UErrorCode status = U_ZERO_ERROR;
 731         RegexPattern  *p = RegexPattern::compile(".", 0, status);
 732         RegexMatcher  *m = p->matcher(status);
 733         REGEX_CHECK_STATUS;
 734
 735         REGEX_ASSERT(m->find() == FALSE);
 736         REGEX_ASSERT(m->input() == "");
 737         delete m;
 738         delete p;
 739     }
 740
 741     //
 742     // Regions
 743     //
 744     {
 745         UErrorCode status = U_ZERO_ERROR;
 746         UnicodeString testString("This is test data");
 747         RegexMatcher m(".*", testString,  0, status);
 748         REGEX_CHECK_STATUS;
 749         REGEX_ASSERT(m.regionStart() == 0);
 750         REGEX_ASSERT(m.regionEnd() == testString.length());
 751         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
 752         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
 753
 754         m.region(2,4, status);
 755         REGEX_CHECK_STATUS;
 756         REGEX_ASSERT(m.matches(status));
 757         REGEX_ASSERT(m.start(status)==2);
 758         REGEX_ASSERT(m.end(status)==4);
 759         REGEX_CHECK_STATUS;
 760
 761         m.reset();
 762         REGEX_ASSERT(m.regionStart() == 0);
 763         REGEX_ASSERT(m.regionEnd() == testString.length());
 764
 765         UnicodeString shorterString("short");
 766         m.reset(shorterString);
 767         REGEX_ASSERT(m.regionStart() == 0);
 768         REGEX_ASSERT(m.regionEnd() == shorterString.length());
 769
 770         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
 771         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
 772         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
 773         REGEX_ASSERT(&m == &m.reset());
 774         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
 775
 776         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
 777         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
 778         REGEX_ASSERT(&m == &m.reset());
 779         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
 780
 781         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
 782         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
 783         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
 784         REGEX_ASSERT(&m == &m.reset());
 785         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
 786
 787         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
 788         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
 789         REGEX_ASSERT(&m == &m.reset());
 790         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
 791
 792     }
 793
 794     //
 795     // hitEnd() and requireEnd()
 796     //
 797     {
 798         UErrorCode status = U_ZERO_ERROR;
 799         UnicodeString testString("aabb");
 800         RegexMatcher m1(".*", testString,  0, status);
 801         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
 802         REGEX_ASSERT(m1.hitEnd() == TRUE);
 803         REGEX_ASSERT(m1.requireEnd() == FALSE);
 804         REGEX_CHECK_STATUS;
 805
 806         status = U_ZERO_ERROR;
 807         RegexMatcher m2("a*", testString, 0, status);
 808         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
 809         REGEX_ASSERT(m2.hitEnd() == FALSE);
 810         REGEX_ASSERT(m2.requireEnd() == FALSE);
 811         REGEX_CHECK_STATUS;
 812
 813         status = U_ZERO_ERROR;
 814         RegexMatcher m3(".*$", testString, 0, status);
 815         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
 816         REGEX_ASSERT(m3.hitEnd() == TRUE);
 817         REGEX_ASSERT(m3.requireEnd() == TRUE);
 818         REGEX_CHECK_STATUS;
 819     }
 820
 821
 822     //
 823     // Compilation error on reset with UChar *
 824     //   These were a hazard that people were stumbling over with runtime errors.
 825     //   Changed them to compiler errors by adding private methods that more closely
 826     //   matched the incorrect use of the functions.
 827     //
 828 #if 0
 829     {
 830         UErrorCode status = U_ZERO_ERROR;
 831         UChar ucharString[20];
 832         RegexMatcher m(".", 0, status);
 833         m.reset(ucharString);  // should not compile.
 834
 835         RegexPattern *p = RegexPattern::compile(".", 0, status);
 836         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
 837
 838         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
 839     }
 840 #endif
 841
 842     //
 843     //  Time Outs.
 844     //       Note:  These tests will need to be changed when the regexp engine is
 845     //              able to detect and cut short the exponential time behavior on
 846     //              this type of match.
 847     //
 848     {
 849         UErrorCode status = U_ZERO_ERROR;
 850         //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
 852         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
 853         RegexMatcher matcher("(a+)+b", testString, 0, status);
 854         REGEX_CHECK_STATUS;
 855         REGEX_ASSERT(matcher.getTimeLimit() == 0);
 856         matcher.setTimeLimit(100, status);
 857         REGEX_ASSERT(matcher.getTimeLimit() == 100);
 858         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
 859         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
 860     }
 861     {
 862         UErrorCode status = U_ZERO_ERROR;
 863         //   Few enough 'a's to slip in under the time limit.
 864         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
 865         RegexMatcher matcher("(a+)+b", testString, 0, status);
 866         REGEX_CHECK_STATUS;
 867         matcher.setTimeLimit(100, status);
 868         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
 869         REGEX_CHECK_STATUS;
 870     }
 871
 872     //
 873     //  Stack Limits
 874     //
 875     {
 876         UErrorCode status = U_ZERO_ERROR;
 877         UnicodeString testString(600000, 0x41, 600000);  // Length 600,000, filled with 'A'
 878
 879         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
 880         //   of the '+', and makes the stack frames larger.
 881         RegexMatcher matcher("(A)+A$", testString, 0, status);
 882
 883         // With the default stack, this match should fail to run
 884         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
 885         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
 886
 887         // With unlimited stack, it should run
 888         status = U_ZERO_ERROR;
 889         matcher.setStackLimit(0, status);
 890         REGEX_CHECK_STATUS;
 891         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
 892         REGEX_CHECK_STATUS;
 893         REGEX_ASSERT(matcher.getStackLimit() == 0);
 894
        // With a limited stack, the match should fail
 896         status = U_ZERO_ERROR;
 897         matcher.setStackLimit(10000, status);
 898         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
 899         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
 900         REGEX_ASSERT(matcher.getStackLimit() == 10000);
 901     }
 902
 903         // A pattern that doesn't save state should work with
 904         //   a minimal sized stack
 905     {
 906         UErrorCode status = U_ZERO_ERROR;
 907         UnicodeString testString = "abc";
 908         RegexMatcher matcher("abc", testString, 0, status);
 909         REGEX_CHECK_STATUS;
 910         matcher.setStackLimit(30, status);
 911         REGEX_CHECK_STATUS;
 912         REGEX_ASSERT(matcher.matches(status) == TRUE);
 913         REGEX_CHECK_STATUS;
 914         REGEX_ASSERT(matcher.getStackLimit() == 30);
 915
 916         // Negative stack sizes should fail
 917         status = U_ZERO_ERROR;
 918         matcher.setStackLimit(1000, status);
 919         REGEX_CHECK_STATUS;
 920         matcher.setStackLimit(-1, status);
 921         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
 922         REGEX_ASSERT(matcher.getStackLimit() == 1000);
 923     }
 924
 925
 926 }
 927
 928
 929
 930
 931
 932
 933 //---------------------------------------------------------------------------
 934 //
 935 //      API_Replace        API test for class RegexMatcher, testing the
 936 //                         Replace family of functions.
 937 //
 938 //---------------------------------------------------------------------------
 939 void RegexTest::API_Replace() {
 940     //
 941     //  Replace
 942     //
 943     int32_t             flags=0;
 944     UParseError         pe;
 945     UErrorCode          status=U_ZERO_ERROR;
 946
 947     UnicodeString       re("abc");
 948     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
 949     REGEX_CHECK_STATUS;
 950     UnicodeString data = ".abc..abc...abc..";
 951     //                    012345678901234567
 952     RegexMatcher *matcher = pat->matcher(data, status);
 953
 954     //
 955     //  Plain vanilla matches.
 956     //
 957     UnicodeString  dest;
 958     dest = matcher->replaceFirst("yz", status);
 959     REGEX_CHECK_STATUS;
 960     REGEX_ASSERT(dest == ".yz..abc...abc..");
 961
 962     dest = matcher->replaceAll("yz", status);
 963     REGEX_CHECK_STATUS;
 964     REGEX_ASSERT(dest == ".yz..yz...yz..");
 965
 966     //
 967     //  Plain vanilla non-matches.
 968     //
 969     UnicodeString d2 = ".abx..abx...abx..";
 970     matcher->reset(d2);
 971     dest = matcher->replaceFirst("yz", status);
 972     REGEX_CHECK_STATUS;
 973     REGEX_ASSERT(dest == ".abx..abx...abx..");
 974
 975     dest = matcher->replaceAll("yz", status);
 976     REGEX_CHECK_STATUS;
 977     REGEX_ASSERT(dest == ".abx..abx...abx..");
 978
 979     //
 980     // Empty source string
 981     //
 982     UnicodeString d3 = "";
 983     matcher->reset(d3);
 984     dest = matcher->replaceFirst("yz", status);
 985     REGEX_CHECK_STATUS;
 986     REGEX_ASSERT(dest == "");
 987
 988     dest = matcher->replaceAll("yz", status);
 989     REGEX_CHECK_STATUS;
 990     REGEX_ASSERT(dest == "");
 991
 992     //
 993     // Empty substitution string
 994     //
 995     matcher->reset(data);              // ".abc..abc...abc.."
 996     dest = matcher->replaceFirst("", status);
 997     REGEX_CHECK_STATUS;
 998     REGEX_ASSERT(dest == "...abc...abc..");
 999
1000     dest = matcher->replaceAll("", status);
1001     REGEX_CHECK_STATUS;
1002     REGEX_ASSERT(dest == "........");
1003
1004     //
1005     // match whole string
1006     //
1007     UnicodeString d4 = "abc";
1008     matcher->reset(d4);
1009     dest = matcher->replaceFirst("xyz", status);
1010     REGEX_CHECK_STATUS;
1011     REGEX_ASSERT(dest == "xyz");
1012
1013     dest = matcher->replaceAll("xyz", status);
1014     REGEX_CHECK_STATUS;
1015     REGEX_ASSERT(dest == "xyz");
1016
1017     //
1018     // Capture Group, simple case
1019     //
1020     UnicodeString       re2("a(..)");
1021     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1022     REGEX_CHECK_STATUS;
1023     UnicodeString d5 = "abcdefg";
1024     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1025     REGEX_CHECK_STATUS;
1026     dest = matcher2->replaceFirst("$1$1", status);
1027     REGEX_CHECK_STATUS;
1028     REGEX_ASSERT(dest == "bcbcdefg");
1029
1030     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1031     REGEX_CHECK_STATUS;
1032     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1033
1034     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1035     REGEX_CHECK_STATUS;
1036     REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
1037
1038     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1039     replacement = replacement.unescape();
1040     dest = matcher2->replaceFirst(replacement, status);
1041     REGEX_CHECK_STATUS;
1042     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1043
1044     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1045
1046
1047     //
1048     // Replacement String with \u hex escapes
1049     //
1050     {
1051         UnicodeString  src = "abc 1 abc 2 abc 3";
1052         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1053         matcher->reset(src);
1054         UnicodeString  result = matcher->replaceAll(substitute, status);
1055         REGEX_CHECK_STATUS;
1056         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1057     }
1058     {
1059         UnicodeString  src = "abc !";
1060         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1061         matcher->reset(src);
1062         UnicodeString  result = matcher->replaceAll(substitute, status);
1063         REGEX_CHECK_STATUS;
1064         UnicodeString expected = UnicodeString("--");
1065         expected.append((UChar32)0x10000);
1066         expected.append("-- !");
1067         REGEX_ASSERT(result == expected);
1068     }
1069     // TODO:  need more through testing of capture substitutions.
1070
1071     // Bug 4057
1072     //
1073     {
1074         status = U_ZERO_ERROR;
1075         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1076         RegexMatcher m("ss(.*?)ee", 0, status);
1077         REGEX_CHECK_STATUS;
1078         UnicodeString result;
1079
1080         // Multiple finds do NOT bump up the previous appendReplacement postion.
1081         m.reset(s);
1082         m.find();
1083         m.find();
1084         m.appendReplacement(result, "ooh", status);
1085         REGEX_CHECK_STATUS;
1086         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1087
1088         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1089         status = U_ZERO_ERROR;
1090         result.truncate(0);
1091         m.reset(10, status);
1092         m.find();
1093         m.find();
1094         m.appendReplacement(result, "ooh", status);
1095         REGEX_CHECK_STATUS;
1096         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1097
1098         // find() at interior of string, appendReplacemnt still starts at beginning.
1099         status = U_ZERO_ERROR;
1100         result.truncate(0);
1101         m.reset();
1102         m.find(10, status);
1103         m.find();
1104         m.appendReplacement(result, "ooh", status);
1105         REGEX_CHECK_STATUS;
1106         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1107
1108         m.appendTail(result);
1109         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1110
1111     }
1112
1113     delete matcher2;
1114     delete pat2;
1115     delete matcher;
1116     delete pat;
1117 }
1118
1119
1120 //---------------------------------------------------------------------------
1121 //
1122 //      API_Pattern       Test that the API for class RegexPattern is
1123 //                        present and nominally working.
1124 //
1125 //---------------------------------------------------------------------------
1126 void RegexTest::API_Pattern() {
1127     RegexPattern        pata;    // Test default constructor to not crash.
1128     RegexPattern        patb;
1129
1130     REGEX_ASSERT(pata == patb);
1131     REGEX_ASSERT(pata == pata);
1132
1133     UnicodeString re1("abc[a-l][m-z]");
1134     UnicodeString re2("def");
1135     UErrorCode    status = U_ZERO_ERROR;
1136     UParseError   pe;
1137
1138     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1139     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1140     REGEX_CHECK_STATUS;
1141     REGEX_ASSERT(*pat1 == *pat1);
1142     REGEX_ASSERT(*pat1 != pata);
1143
1144     // Assign
1145     patb = *pat1;
1146     REGEX_ASSERT(patb == *pat1);
1147
1148     // Copy Construct
1149     RegexPattern patc(*pat1);
1150     REGEX_ASSERT(patc == *pat1);
1151     REGEX_ASSERT(patb == patc);
1152     REGEX_ASSERT(pat1 != pat2);
1153     patb = *pat2;
1154     REGEX_ASSERT(patb != patc);
1155     REGEX_ASSERT(patb == *pat2);
1156
1157     // Compile with no flags.
1158     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1159     REGEX_ASSERT(*pat1a == *pat1);
1160
1161     REGEX_ASSERT(pat1a->flags() == 0);
1162
1163     // Compile with different flags should be not equal
1164     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1165     REGEX_CHECK_STATUS;
1166
1167     REGEX_ASSERT(*pat1b != *pat1a);
1168     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1169     REGEX_ASSERT(pat1a->flags() == 0);
1170     delete pat1b;
1171
1172     // clone
1173     RegexPattern *pat1c = pat1->clone();
1174     REGEX_ASSERT(*pat1c == *pat1);
1175     REGEX_ASSERT(*pat1c != *pat2);
1176
1177     delete pat1c;
1178     delete pat1a;
1179     delete pat1;
1180     delete pat2;
1181
1182
1183     //
1184     //   Verify that a matcher created from a cloned pattern works.
1185     //     (Jitterbug 3423)
1186     //
1187     {
1188         UErrorCode     status     = U_ZERO_ERROR;
1189         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1190         RegexPattern  *pClone     = pSource->clone();
1191         delete         pSource;
1192         RegexMatcher  *mFromClone = pClone->matcher(status);
1193         REGEX_CHECK_STATUS;
1194         UnicodeString s = "Hello World";
1195         mFromClone->reset(s);
1196         REGEX_ASSERT(mFromClone->find() == TRUE);
1197         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1198         REGEX_ASSERT(mFromClone->find() == TRUE);
1199         REGEX_ASSERT(mFromClone->group(status) == "World");
1200         REGEX_ASSERT(mFromClone->find() == FALSE);
1201         delete mFromClone;
1202         delete pClone;
1203     }
1204
1205     //
1206     //   matches convenience API
1207     //
1208     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1209     REGEX_CHECK_STATUS;
1210     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1211     REGEX_CHECK_STATUS;
1212     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1213     REGEX_CHECK_STATUS;
1214     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1215     REGEX_CHECK_STATUS;
1216     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1217     REGEX_CHECK_STATUS;
1218     status = U_INDEX_OUTOFBOUNDS_ERROR;
1219     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1220     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1221
1222
1223     //
1224     // Split()
1225     //
1226     status = U_ZERO_ERROR;
1227     pat1 = RegexPattern::compile(" +",  pe, status);
1228     REGEX_CHECK_STATUS;
1229     UnicodeString  fields[10];
1230
1231     int32_t n;
1232     n = pat1->split("Now is the time", fields, 10, status);
1233     REGEX_CHECK_STATUS;
1234     REGEX_ASSERT(n==4);
1235     REGEX_ASSERT(fields[0]=="Now");
1236     REGEX_ASSERT(fields[1]=="is");
1237     REGEX_ASSERT(fields[2]=="the");
1238     REGEX_ASSERT(fields[3]=="time");
1239     REGEX_ASSERT(fields[4]=="");
1240
1241     n = pat1->split("Now is the time", fields, 2, status);
1242     REGEX_CHECK_STATUS;
1243     REGEX_ASSERT(n==2);
1244     REGEX_ASSERT(fields[0]=="Now");
1245     REGEX_ASSERT(fields[1]=="is the time");
1246     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1247
1248     fields[1] = "*";
1249     status = U_ZERO_ERROR;
1250     n = pat1->split("Now is the time", fields, 1, status);
1251     REGEX_CHECK_STATUS;
1252     REGEX_ASSERT(n==1);
1253     REGEX_ASSERT(fields[0]=="Now is the time");
1254     REGEX_ASSERT(fields[1]=="*");
1255     status = U_ZERO_ERROR;
1256
1257     n = pat1->split("    Now       is the time   ", fields, 10, status);
1258     REGEX_CHECK_STATUS;
1259     REGEX_ASSERT(n==5);
1260     REGEX_ASSERT(fields[0]=="");
1261     REGEX_ASSERT(fields[1]=="Now");
1262     REGEX_ASSERT(fields[2]=="is");
1263     REGEX_ASSERT(fields[3]=="the");
1264     REGEX_ASSERT(fields[4]=="time");
1265     REGEX_ASSERT(fields[5]=="");
1266
1267     n = pat1->split("     ", fields, 10, status);
1268     REGEX_CHECK_STATUS;
1269     REGEX_ASSERT(n==1);
1270     REGEX_ASSERT(fields[0]=="");
1271
1272     fields[0] = "foo";
1273     n = pat1->split("", fields, 10, status);
1274     REGEX_CHECK_STATUS;
1275     REGEX_ASSERT(n==0);
1276     REGEX_ASSERT(fields[0]=="foo");
1277
1278     delete pat1;
1279
1280     //  split, with a pattern with (capture)
1281     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1282     REGEX_CHECK_STATUS;
1283
1284     status = U_ZERO_ERROR;
1285     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1286     REGEX_CHECK_STATUS;
1287     REGEX_ASSERT(n==6);
1288     REGEX_ASSERT(fields[0]=="");
1289     REGEX_ASSERT(fields[1]=="a");
1290     REGEX_ASSERT(fields[2]=="Now is ");
1291     REGEX_ASSERT(fields[3]=="b");
1292     REGEX_ASSERT(fields[4]=="the time");
1293     REGEX_ASSERT(fields[5]=="c");
1294     REGEX_ASSERT(fields[6]=="");
1295     REGEX_ASSERT(status==U_ZERO_ERROR);
1296
1297     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1298     REGEX_CHECK_STATUS;
1299     REGEX_ASSERT(n==6);
1300     REGEX_ASSERT(fields[0]=="  ");
1301     REGEX_ASSERT(fields[1]=="a");
1302     REGEX_ASSERT(fields[2]=="Now is ");
1303     REGEX_ASSERT(fields[3]=="b");
1304     REGEX_ASSERT(fields[4]=="the time");
1305     REGEX_ASSERT(fields[5]=="c");
1306     REGEX_ASSERT(fields[6]=="");
1307
1308     status = U_ZERO_ERROR;
1309     fields[6] = "foo";
1310     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1311     REGEX_CHECK_STATUS;
1312     REGEX_ASSERT(n==6);
1313     REGEX_ASSERT(fields[0]=="  ");
1314     REGEX_ASSERT(fields[1]=="a");
1315     REGEX_ASSERT(fields[2]=="Now is ");
1316     REGEX_ASSERT(fields[3]=="b");
1317     REGEX_ASSERT(fields[4]=="the time");
1318     REGEX_ASSERT(fields[5]=="c");
1319     REGEX_ASSERT(fields[6]=="foo");
1320
1321     status = U_ZERO_ERROR;
1322     fields[5] = "foo";
1323     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1324     REGEX_CHECK_STATUS;
1325     REGEX_ASSERT(n==5);
1326     REGEX_ASSERT(fields[0]=="  ");
1327     REGEX_ASSERT(fields[1]=="a");
1328     REGEX_ASSERT(fields[2]=="Now is ");
1329     REGEX_ASSERT(fields[3]=="b");
1330     REGEX_ASSERT(fields[4]=="the time<c>");
1331     REGEX_ASSERT(fields[5]=="foo");
1332
1333     status = U_ZERO_ERROR;
1334     fields[5] = "foo";
1335     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1336     REGEX_CHECK_STATUS;
1337     REGEX_ASSERT(n==5);
1338     REGEX_ASSERT(fields[0]=="  ");
1339     REGEX_ASSERT(fields[1]=="a");
1340     REGEX_ASSERT(fields[2]=="Now is ");
1341     REGEX_ASSERT(fields[3]=="b");
1342     REGEX_ASSERT(fields[4]=="the time");
1343     REGEX_ASSERT(fields[5]=="foo");
1344
1345     status = U_ZERO_ERROR;
1346     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1347     REGEX_CHECK_STATUS;
1348     REGEX_ASSERT(n==4);
1349     REGEX_ASSERT(fields[0]=="  ");
1350     REGEX_ASSERT(fields[1]=="a");
1351     REGEX_ASSERT(fields[2]=="Now is ");
1352     REGEX_ASSERT(fields[3]=="the time<c>");
1353     status = U_ZERO_ERROR;
1354     delete pat1;
1355
1356     pat1 = RegexPattern::compile("([-,])",  pe, status);
1357     REGEX_CHECK_STATUS;
1358     n = pat1->split("1-10,20", fields, 10, status);
1359     REGEX_CHECK_STATUS;
1360     REGEX_ASSERT(n==5);
1361     REGEX_ASSERT(fields[0]=="1");
1362     REGEX_ASSERT(fields[1]=="-");
1363     REGEX_ASSERT(fields[2]=="10");
1364     REGEX_ASSERT(fields[3]==",");
1365     REGEX_ASSERT(fields[4]=="20");
1366     delete pat1;
1367
1368
1369     //
1370     // RegexPattern::pattern()
1371     //
1372     pat1 = new RegexPattern();
1373     REGEX_ASSERT(pat1->pattern() == "");
1374     delete pat1;
1375
1376     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1377     REGEX_CHECK_STATUS;
1378     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1379     delete pat1;
1380
1381
1382     //
1383     // classID functions
1384     //
1385     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1386     REGEX_CHECK_STATUS;
1387     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1388     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1389     UnicodeString Hello("Hello, world.");
1390     RegexMatcher *m = pat1->matcher(Hello, status);
1391     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1392     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1393     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1394     delete m;
1395     delete pat1;
1396
1397 }
1398
1399 //---------------------------------------------------------------------------
1400 //
1401 //      Extended       A more thorough check for features of regex patterns
1402 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
1404 //                     A description of the test data format is included in that file.
1405 //
1406 //---------------------------------------------------------------------------
1407
1408 const char *
1409 RegexTest::getPath(char buffer[2048], const char *filename) {
1410     UErrorCode status=U_ZERO_ERROR;
1411     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1412     if (U_FAILURE(status)) {
1413         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
1414         return NULL;
1415     }
1416
1417     strcpy(buffer, testDataDirectory);
1418     strcat(buffer, filename);
1419     return buffer;
1420 }
1421
1422 void RegexTest::Extended() {
1423     char tdd[2048];
1424     const char *srcPath;
1425     UErrorCode  status  = U_ZERO_ERROR;
1426     int32_t     lineNum = 0;
1427
1428     //
1429     //  Open and read the test data file.
1430     //
1431     srcPath=getPath(tdd, "regextst.txt");
1432     if(srcPath==NULL) {
1433         return; /* something went wrong, error already output */
1434     }
1435
1436     int32_t    len;
1437     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
1438     if (U_FAILURE(status)) {
1439         return; /* something went wrong, error already output */
1440     }
1441
1442     //
1443     //  Put the test data into a UnicodeString
1444     //
1445     UnicodeString testString(FALSE, testData, len);
1446
1447     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
1448     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
1449     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
1450
1451     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
1452     UnicodeString   testPattern;   // The pattern for test from the test file.
1453     UnicodeString   testFlags;     // the flags   for a test.
1454     UnicodeString   matchString;   // The marked up string to be used as input
1455
1456     if (U_FAILURE(status)){
1457         dataerrln("Construct RegexMatcher() error.");
1458         delete [] testData;
1459         return;
1460     }
1461
1462     //
1463     //  Loop over the test data file, once per line.
1464     //
1465     while (lineMat.find()) {
1466         lineNum++;
1467         if (U_FAILURE(status)) {
1468             errln("line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
1469         }
1470
1471         status = U_ZERO_ERROR;
1472         UnicodeString testLine = lineMat.group(1, status);
1473         if (testLine.length() == 0) {
1474             continue;
1475         }
1476
1477         //
1478         // Parse the test line.  Skip blank and comment only lines.
1479         // Separate out the three main fields - pattern, flags, target.
1480         //
1481
1482         commentMat.reset(testLine);
1483         if (commentMat.lookingAt(status)) {
1484             // This line is a comment, or blank.
1485             continue;
1486         }
1487
1488         //
1489         //  Pull out the pattern field, remove it from the test file line.
1490         //
1491         quotedStuffMat.reset(testLine);
1492         if (quotedStuffMat.lookingAt(status)) {
1493             testPattern = quotedStuffMat.group(2, status);
1494             testLine.remove(0, quotedStuffMat.end(0, status));
1495         } else {
1496             errln("Bad pattern (missing quotes?) at test file line %d", lineNum);
1497             continue;
1498         }
1499
1500
1501         //
1502         //  Pull out the flags from the test file line.
1503         //
1504         flagsMat.reset(testLine);
1505         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
1506         testFlags = flagsMat.group(1, status);
1507         if (flagsMat.group(2, status).length() > 0) {
1508             errln("Bad Match flag at line %d. Scanning %c\n",
1509                 lineNum, flagsMat.group(2, status).charAt(0));
1510             continue;
1511         }
1512         testLine.remove(0, flagsMat.end(0, status));
1513
1514         //
1515         //  Pull out the match string, as a whole.
1516         //    We'll process the <tags> later.
1517         //
1518         quotedStuffMat.reset(testLine);
1519         if (quotedStuffMat.lookingAt(status)) {
1520             matchString = quotedStuffMat.group(2, status);
1521             testLine.remove(0, quotedStuffMat.end(0, status));
1522         } else {
1523             errln("Bad match string at test file line %d", lineNum);
1524             continue;
1525         }
1526
1527         //
1528         //  The only thing left from the input line should be an optional trailing comment.
1529         //
1530         commentMat.reset(testLine);
1531         if (commentMat.lookingAt(status) == FALSE) {
1532             errln("Line %d: unexpected characters at end of test line.", lineNum);
1533             continue;
1534         }
1535
1536         //
1537         //  Run the test
1538         //
1539         regex_find(testPattern, testFlags, matchString, lineNum);
1540     }
1541
1542     delete [] testData;
1543
1544 }
1545
1546
1547
1548 //---------------------------------------------------------------------------
1549 //
1550 //    regex_find(pattern, flags, inputString, lineNumber)
1551 //
1552 //         Function to run a single test from the Extended (data driven) tests.
1553 //         See file test/testdata/regextst.txt for a description of the
1554 //         pattern and inputString fields, and the allowed flags.
1555 //         lineNumber is the source line in regextst.txt of the test.
1556 //
1557 //---------------------------------------------------------------------------
1558
1559
1560 //  Set a value into a UVector at position specified by a decimal number in
1561 //   a UnicodeString.   This is a utility function needed by the actual test function,
1562 //   which follows.
1563 static void set(UVector &vec, int32_t val, UnicodeString index) {
1564     UErrorCode  status=U_ZERO_ERROR;
1565     int32_t  idx = 0;
1566     for (int32_t i=0; i<index.length(); i++) {
1567         int32_t d=u_charDigitValue(index.charAt(i));
1568         if (d<0) {return;}
1569         idx = idx*10 + d;
1570     }
1571     while (vec.size()<idx+1) {vec.addElement(-1, status);}
1572     vec.setElementAt(val, idx);
1573 }
1574
1575 void RegexTest::regex_find(const UnicodeString &pattern,
1576                            const UnicodeString &flags,
1577                            const UnicodeString &inputString,
1578                            int32_t line) {
1579     UnicodeString       unEscapedInput;
1580     UnicodeString       deTaggedInput;
1581
1582     UErrorCode          status         = U_ZERO_ERROR;
1583     UParseError         pe;
1584     RegexPattern        *parsePat      = NULL;
1585     RegexMatcher        *parseMatcher  = NULL;
1586     RegexPattern        *callerPattern = NULL;
1587     RegexMatcher        *matcher       = NULL;
1588     UVector             groupStarts(status);
1589     UVector             groupEnds(status);
1590     UBool               isMatch        = FALSE;
1591     UBool               failed         = FALSE;
1592     int32_t             numFinds;
1593     int32_t             i;
1594     UBool               useMatchesFunc   = FALSE;
1595     UBool               useLookingAtFunc = FALSE;
1596     int32_t             regionStart      = -1;
1597     int32_t             regionEnd        = -1;
1598
1599     //
1600     //  Compile the caller's pattern
1601     //
1602     uint32_t bflags = 0;
1603     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
1604         bflags |= UREGEX_CASE_INSENSITIVE;
1605     }
1606     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
1607         bflags |= UREGEX_COMMENTS;
1608     }
1609     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
1610         bflags |= UREGEX_DOTALL;
1611     }
1612     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
1613         bflags |= UREGEX_MULTILINE;
1614     }
1615
1616     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
1617         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
1618     }
1619     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
1620         bflags |= UREGEX_UNIX_LINES;
1621     }
1622
1623
1624     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
1625     if (status != U_ZERO_ERROR) {
1626         #if UCONFIG_NO_BREAK_ITERATION==1
1627         // 'v' test flag means that the test pattern should not compile if ICU was configured
1628         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
1629         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
1630             goto cleanupAndReturn;
1631         }
1632         #endif
1633         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
1634             // Expected pattern compilation error.
1635             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
1636                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
1637             }
1638             goto cleanupAndReturn;
1639         } else {
1640             // Unexpected pattern compilation error.
1641             errln("Line %d: error %s compiling pattern.", line, u_errorName(status));
1642             goto cleanupAndReturn;
1643         }
1644     }
1645
1646     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
1647         RegexPatternDump(callerPattern);
1648     }
1649
1650     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
1651         errln("Expected, but did not get, a pattern compilation error.");
1652         goto cleanupAndReturn;
1653     }
1654
1655
1656     //
1657     // Number of times find() should be called on the test string, default to 1
1658     //
1659     numFinds = 1;
1660     for (i=2; i<=9; i++) {
1661         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
1662             if (numFinds != 1) {
1663                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
1664                 goto cleanupAndReturn;
1665             }
1666             numFinds = i;
1667         }
1668     }
1669
1670     // 'M' flag.  Use matches() instead of find()
1671     if (flags.indexOf((UChar)0x4d) >= 0) {
1672         useMatchesFunc = TRUE;
1673     }
1674     if (flags.indexOf((UChar)0x4c) >= 0) {
1675         useLookingAtFunc = TRUE;
1676     }
1677
1678     //
1679     //  Find the tags in the input data, remove them, and record the group boundary
1680     //    positions.
1681     //
1682     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
1683     REGEX_CHECK_STATUS_L(line);
1684
1685     unEscapedInput = inputString.unescape();
1686     parseMatcher = parsePat->matcher(unEscapedInput, status);
1687     REGEX_CHECK_STATUS_L(line);
1688     while(parseMatcher->find()) {
1689         parseMatcher->appendReplacement(deTaggedInput, "", status);
1690         REGEX_CHECK_STATUS;
1691         UnicodeString groupNum = parseMatcher->group(2, status);
1692         if (groupNum == "r") {
1693             // <r> or </r>, a region specification within the string
1694             if (parseMatcher->group(1, status) == "/") {
1695                 regionEnd = deTaggedInput.length();
1696             } else {
1697                 regionStart = deTaggedInput.length();
1698             }
1699         } else {
1700             // <digits> or </digits>, a group match boundary tag.
1701             if (parseMatcher->group(1, status) == "/") {
1702                 set(groupEnds, deTaggedInput.length(), groupNum);
1703             } else {
1704                 set(groupStarts, deTaggedInput.length(), groupNum);
1705             }
1706         }
1707     }
1708     parseMatcher->appendTail(deTaggedInput);
1709     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
1710     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
1711       errln("mismatched <r> tags");
1712       failed = TRUE;
1713       goto cleanupAndReturn;
1714     }
1715
1716
1717     //
1718     //  Configure the matcher according to the flags specified with this test.
1719     //
1720     matcher = callerPattern->matcher(deTaggedInput, status);
1721     REGEX_CHECK_STATUS_L(line);
1722     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
1723         matcher->setTrace(TRUE);
1724     }
1725     if (regionStart>=0) {
1726        matcher->region(regionStart, regionEnd, status);
1727        REGEX_CHECK_STATUS_L(line);
1728     }
1729     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
1730         matcher->useAnchoringBounds(FALSE);
1731     }
1732     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
1733         matcher->useTransparentBounds(TRUE);
1734     }
1735
1736
1737
1738     //
1739     // Do a find on the de-tagged input using the caller's pattern
1740     //     TODO: error on count>1 and not find().
1741     //           error on both matches() and lookingAt().
1742     //
1743     for (i=0; i<numFinds; i++) {
1744         if (useMatchesFunc) {
1745             isMatch = matcher->matches(status);
1746         } else  if (useLookingAtFunc) {
1747             isMatch = matcher->lookingAt(status);
1748         } else {
1749             isMatch = matcher->find();
1750         }
1751     }
1752     matcher->setTrace(FALSE);
1753
1754     //
1755     // Match up the groups from the find() with the groups from the tags
1756     //
1757
1758     // number of tags should match number of groups from find operation.
1759     // matcher->groupCount does not include group 0, the entire match, hence the +1.
1760     //   G option in test means that capture group data is not available in the
1761     //     expected results, so the check needs to be suppressed.
1762     if (isMatch == FALSE && groupStarts.size() != 0) {
1763         errln("Error at line %d:  Match expected, but none found.\n", line);
1764         failed = TRUE;
1765         goto cleanupAndReturn;
1766     }
1767
1768     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
1769         // Only check for match / no match.  Don't check capture groups.
1770         if (isMatch && groupStarts.size() == 0) {
1771             errln("Error at line %d:  No match expected, but one found.\n", line);
1772             failed = TRUE;
1773         }
1774         goto cleanupAndReturn;
1775     }
1776
1777     for (i=0; i<=matcher->groupCount(); i++) {
1778         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
1779         if (matcher->start(i, status) != expectedStart) {
1780             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
1781                 line, i, expectedStart, matcher->start(i, status));
1782             failed = TRUE;
1783             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
1784         }
1785         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
1786         if (matcher->end(i, status) != expectedEnd) {
1787             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
1788                 line, i, expectedEnd, matcher->end(i, status));
1789             failed = TRUE;
1790             // Error on end position;  keep going; real error is probably yet to come as group
1791             //   end positions work from end of the input data towards the front.
1792         }
1793     }
1794     if ( matcher->groupCount()+1 < groupStarts.size()) {
1795         errln("Error at line %d: Expected %d capture groups, found %d.",
1796             line, groupStarts.size()-1, matcher->groupCount());
1797         failed = TRUE;
1798         }
1799
1800     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
1801         matcher->requireEnd() == TRUE) {
1802         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
1803         failed = TRUE;
1804     }
1805     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
1806         matcher->requireEnd() == FALSE) {
1807         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
1808         failed = TRUE;
1809     }
1810     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
1811         matcher->hitEnd() == TRUE) {
1812         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
1813         failed = TRUE;
1814     }
1815     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
1816         matcher->hitEnd() == FALSE) {
1817         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
1818         failed = TRUE;
1819     }
1820
1821
1822 cleanupAndReturn:
1823     if (failed) {
1824         errln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
1825             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
1826         // callerPattern->dump();
1827     }
1828     delete parseMatcher;
1829     delete parsePat;
1830     delete matcher;
1831     delete callerPattern;
1832 }
1833
1834
1835
1836
1837 //---------------------------------------------------------------------------
1838 //
1839 //      Errors     Check for error handling in patterns.
1840 //
1841 //---------------------------------------------------------------------------
1842 void RegexTest::Errors() {
1843     // \escape sequences that aren't implemented yet.
1844     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
1845
1846     // Missing close parentheses
1847     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
1848     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
1849     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
1850
1851     // Extra close paren
1852     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
1853     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
1854     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
1855
1856     // Look-ahead, Look-behind
1857     //  TODO:  add tests for unbounded length look-behinds.
1858     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
1859
1860     // Attempt to use non-default flags
1861     {
1862         UParseError   pe;
1863         UErrorCode    status = U_ZERO_ERROR;
1864         int32_t       flags  = UREGEX_CANON_EQ |
1865                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
1866                                UREGEX_MULTILINE;
1867         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
1868         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
1869         delete pat1;
1870     }
1871
1872
1873     // Quantifiers are allowed only after something that can be quantified.
1874     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
1875     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
1876     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
1877
1878     // Mal-formed {min,max} quantifiers
1879     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
1880     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
1881     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
1882     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
1883     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
1884     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
1885     REGEX_ERR("abc{5,50000000000}", 1, 17, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
1886     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
1887     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
1888
1889     // Ticket 5389
1890     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
1891
1892 }
1893
1894
1895 //-------------------------------------------------------------------------------
1896 //
1897 //  Read a text data file, convert it to UChars, and return the data
1898 //    in one big UChar * buffer, which the caller must delete.
1899 //
1900 //--------------------------------------------------------------------------------
1901 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
1902                                      const char *defEncoding, UErrorCode &status) {
1903     UChar       *retPtr  = NULL;
1904     char        *fileBuf = NULL;
1905     UConverter* conv     = NULL;
1906     FILE        *f       = NULL;
1907
1908     ulen = 0;
1909     if (U_FAILURE(status)) {
1910         return retPtr;
1911     }
1912
1913     //
1914     //  Open the file.
1915     //
1916     f = fopen(fileName, "rb");
1917     if (f == 0) {
1918         dataerrln("[DATA] Error opening test data file %s\n", fileName);
1919         status = U_FILE_ACCESS_ERROR;
1920         return NULL;
1921     }
1922     //
1923     //  Read it in
1924     //
1925     int32_t            fileSize;
1926     int32_t            amt_read;
1927
1928     fseek( f, 0, SEEK_END);
1929     fileSize = ftell(f);
1930     fileBuf = new char[fileSize];
1931     fseek(f, 0, SEEK_SET);
1932     amt_read = fread(fileBuf, 1, fileSize, f);
1933     if (amt_read != fileSize || fileSize <= 0) {
1934         errln("Error reading test data file.");
1935         goto cleanUpAndReturn;
1936     }
1937
1938     //
1939     // Look for a Unicode Signature (BOM) on the data just read
1940     //
1941     int32_t        signatureLength;
1942     const char *   fileBufC;
1943     const char*    encoding;
1944
1945     fileBufC = fileBuf;
1946     encoding = ucnv_detectUnicodeSignature(
1947         fileBuf, fileSize, &signatureLength, &status);
1948     if(encoding!=NULL ){
1949         fileBufC  += signatureLength;
1950         fileSize  -= signatureLength;
1951     } else {
1952         encoding = defEncoding;
1953         if (strcmp(encoding, "utf-8") == 0) {
1954             errln("file %s is missing its BOM", fileName);
1955         }
1956     }
1957
1958     //
1959     // Open a converter to take the rule file to UTF-16
1960     //
1961     conv = ucnv_open(encoding, &status);
1962     if (U_FAILURE(status)) {
1963         goto cleanUpAndReturn;
1964     }
1965
1966     //
1967     // Convert the rules to UChar.
1968     //  Preflight first to determine required buffer size.
1969     //
1970     ulen = ucnv_toUChars(conv,
1971         NULL,           //  dest,
1972         0,              //  destCapacity,
1973         fileBufC,
1974         fileSize,
1975         &status);
1976     if (status == U_BUFFER_OVERFLOW_ERROR) {
1977         // Buffer Overflow is expected from the preflight operation.
1978         status = U_ZERO_ERROR;
1979
1980         retPtr = new UChar[ulen+1];
1981         ucnv_toUChars(conv,
1982             retPtr,       //  dest,
1983             ulen+1,
1984             fileBufC,
1985             fileSize,
1986             &status);
1987     }
1988
1989 cleanUpAndReturn:
1990     fclose(f);
1991     delete[] fileBuf;
1992     ucnv_close(conv);
1993     if (U_FAILURE(status)) {
1994         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1995         delete retPtr;
1996         retPtr = 0;
1997         ulen   = 0;
1998     };
1999     return retPtr;
2000 }
2001
2002
2003 //-------------------------------------------------------------------------------
2004 //
2005 //   PerlTests  - Run Perl's regular expression tests
2006 //                The input file for this test is re_tests, the standard regular
2007 //                expression test data distributed with the Perl source code.
2008 //
2009 //                Here is Perl's description of the test data file:
2010 //
2011 //        # The tests are in a separate file 't/op/re_tests'.
2012 //        # Each line in that file is a separate test.
2013 //        # There are five columns, separated by tabs.
2014 //        #
2015 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
2016 //        # Modifiers can be put after the closing C<'>.
2017 //        #
2018 //        # Column 2 contains the string to be matched.
2019 //        #
2020 //        # Column 3 contains the expected result:
2021 //        #     y   expect a match
2022 //        #     n   expect no match
2023 //        #     c   expect an error
2024 //        # B   test exposes a known bug in Perl, should be skipped
2025 //        # b   test exposes a known bug in Perl, should be skipped if noamp
2026 //        #
2027 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
2028 //        #
2029 //        # Column 4 contains a string, usually C<$&>.
2030 //        #
2031 //        # Column 5 contains the expected result of double-quote
2032 //        # interpolating that string after the match, or start of error message.
2033 //        #
2034 //        # Column 6, if present, contains a reason why the test is skipped.
2035 //        # This is printed with "skipped", for harness to pick up.
2036 //        #
2037 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
2038 //        #
2039 //        # If you want to add a regular expression test that can't be expressed
2040 //        # in this format, don't add it here: put it in op/pat.t instead.
2041 //
2042 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
2044 //        (The i is in addition to whatever was there before.)
2045 //
2046 //-------------------------------------------------------------------------------
2047 void RegexTest::PerlTests() {
2048     char tdd[2048];
2049     const char *srcPath;
2050     UErrorCode  status = U_ZERO_ERROR;
2051     UParseError pe;
2052
2053     //
2054     //  Open and read the test data file.
2055     //
2056     srcPath=getPath(tdd, "re_tests.txt");
2057     if(srcPath==NULL) {
2058         return; /* something went wrong, error already output */
2059     }
2060
2061     int32_t    len;
2062     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
2063     if (U_FAILURE(status)) {
2064         return; /* something went wrong, error already output */
2065     }
2066
2067     //
2068     //  Put the test data into a UnicodeString
2069     //
2070     UnicodeString testDataString(FALSE, testData, len);
2071
2072     //
2073     //  Regex to break the input file into lines, and strip the new lines.
2074     //     One line per match, capture group one is the desired data.
2075     //
2076     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
2077     if (U_FAILURE(status)) {
2078         dataerrln("RegexPattern::compile() error");
2079         return;
2080     }
2081     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
2082
2083     //
2084     //  Regex to split a test file line into fields.
2085     //    There are six fields, separated by tabs.
2086     //
2087     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
2088
2089     //
2090     //  Regex to identify test patterns with flag settings, and to separate them.
2091     //    Test patterns with flags look like 'pattern'i
2092     //    Test patterns without flags are not quoted:   pattern
2093     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
2094     //
2095     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
2096     RegexMatcher* flagMat = flagPat->matcher(status);
2097
2098     //
2099     // The Perl tests reference several perl-isms, which are evaluated/substituted
2100     //   in the test data.  Not being perl, this must be done explicitly.  Here
2101     //   are string constants and REs for these constructs.
2102     //
2103     UnicodeString nulnulSrc("${nulnul}");
2104     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
2105     nulnul = nulnul.unescape();
2106
2107     UnicodeString ffffSrc("${ffff}");
2108     UnicodeString ffff("\\uffff", -1, US_INV);
2109     ffff = ffff.unescape();
2110
2111     //  regexp for $-[0], $+[2], etc.
2112     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
2113     RegexMatcher *groupsMat = groupsPat->matcher(status);
2114
2115     //  regexp for $0, $1, $2, etc.
2116     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
2117     RegexMatcher *cgMat = cgPat->matcher(status);
2118
2119
2120     //
2121     // Main Loop for the Perl Tests, runs once per line from the
2122     //   test data file.
2123     //
2124     int32_t  lineNum = 0;
2125     int32_t  skippedUnimplementedCount = 0;
2126     while (lineMat->find()) {
2127         lineNum++;
2128
2129         //
2130         //  Get a line, break it into its fields, do the Perl
2131         //    variable substitutions.
2132         //
2133         UnicodeString line = lineMat->group(1, status);
2134         UnicodeString fields[7];
2135         fieldPat->split(line, fields, 7, status);
2136
2137         flagMat->reset(fields[0]);
2138         flagMat->matches(status);
2139         UnicodeString pattern  = flagMat->group(2, status);
2140         pattern.findAndReplace("${bang}", "!");
2141         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
2142         pattern.findAndReplace(ffffSrc, ffff);
2143
2144         //
2145         //  Identify patterns that include match flag settings,
2146         //    split off the flags, remove the extra quotes.
2147         //
2148         UnicodeString flagStr = flagMat->group(3, status);
2149         if (U_FAILURE(status)) {
2150             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2151             return;
2152         }
2153         int32_t flags = 0;
2154         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
2155         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
2156         const UChar UChar_m = 0x6d;
2157         const UChar UChar_x = 0x78;
2158         const UChar UChar_y = 0x79;
2159         if (flagStr.indexOf(UChar_i) != -1) {
2160             flags |= UREGEX_CASE_INSENSITIVE;
2161         }
2162         if (flagStr.indexOf(UChar_m) != -1) {
2163             flags |= UREGEX_MULTILINE;
2164         }
2165         if (flagStr.indexOf(UChar_x) != -1) {
2166             flags |= UREGEX_COMMENTS;
2167         }
2168
2169         //
2170         // Compile the test pattern.
2171         //
2172         status = U_ZERO_ERROR;
2173         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
2174         if (status == U_REGEX_UNIMPLEMENTED) {
2175             //
2176             // Test of a feature that is planned for ICU, but not yet implemented.
2177             //   skip the test.
2178             skippedUnimplementedCount++;
2179             delete testPat;
2180             status = U_ZERO_ERROR;
2181             continue;
2182         }
2183
2184         if (U_FAILURE(status)) {
2185             // Some tests are supposed to generate errors.
2186             //   Only report an error for tests that are supposed to succeed.
2187             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
2188                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
2189             {
2190                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
2191             }
2192             status = U_ZERO_ERROR;
2193             delete testPat;
2194             continue;
2195         }
2196
2197         if (fields[2].indexOf(UChar_i) >= 0) {
2198             // ICU should skip this test.
2199             delete testPat;
2200             continue;
2201         }
2202
2203         if (fields[2].indexOf(UChar_c) >= 0) {
2204             // This pattern should have caused a compilation error, but didn't/
2205             errln("line %d: Expected a pattern compile error, got success.", lineNum);
2206             delete testPat;
2207             continue;
2208         }
2209
2210         //
2211         // replace the Perl variables that appear in some of the
2212         //   match data strings.
2213         //
2214         UnicodeString matchString = fields[1];
2215         matchString.findAndReplace(nulnulSrc, nulnul);
2216         matchString.findAndReplace(ffffSrc,   ffff);
2217
2218         // Replace any \n in the match string with an actual new-line char.
2219         //  Don't do full unescape, as this unescapes more than Perl does, which
2220         //  causes other spurious failures in the tests.
2221         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
2222
2223
2224
2225         //
2226         // Run the test, check for expected match/don't match result.
2227         //
2228         RegexMatcher *testMat = testPat->matcher(matchString, status);
2229         UBool found = testMat->find();
2230         UBool expected = FALSE;
2231         if (fields[2].indexOf(UChar_y) >=0) {
2232             expected = TRUE;
2233         }
2234         if (expected != found) {
2235             errln("line %d: Expected %smatch, got %smatch",
2236                 lineNum, expected?"":"no ", found?"":"no " );
2237             continue;
2238         }
2239
2240         // Don't try to check expected results if there is no match.
2241         //   (Some have stuff in the expected fields)
2242         if (!found) {
2243             delete testMat;
2244             delete testPat;
2245             continue;
2246         }
2247
2248         //
2249         // Interpret the Perl expression from the fourth field of the data file,
2250         // building up an ICU string from the results of the ICU match.
2251         //   The Perl expression will contain references to the results of
2252         //     a regex match, including the matched string, capture group strings,
2253         //     group starting and ending indicies, etc.
2254         //
2255         UnicodeString resultString;
2256         UnicodeString perlExpr = fields[3];
2257         groupsMat->reset(perlExpr);
2258         cgMat->reset(perlExpr);
2259
2260         while (perlExpr.length() > 0) {
2261             if (perlExpr.startsWith("$&")) {
2262                 resultString.append(testMat->group(status));
2263                 perlExpr.remove(0, 2);
2264             }
2265
2266             else if (groupsMat->lookingAt(status)) {
2267                 // $-[0]   $+[2]  etc.
2268                 UnicodeString digitString = groupsMat->group(2, status);
2269                 int32_t t = 0;
2270                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2271                 UnicodeString plusOrMinus = groupsMat->group(1, status);
2272                 int32_t matchPosition;
2273                 if (plusOrMinus.compare("+") == 0) {
2274                     matchPosition = testMat->end(groupNum, status);
2275                 } else {
2276                     matchPosition = testMat->start(groupNum, status);
2277                 }
2278                 if (matchPosition != -1) {
2279                     ICU_Utility::appendNumber(resultString, matchPosition);
2280                 }
2281                 perlExpr.remove(0, groupsMat->end(status));
2282             }
2283
2284             else if (cgMat->lookingAt(status)) {
2285                 // $1, $2, $3, etc.
2286                 UnicodeString digitString = cgMat->group(1, status);
2287                 int32_t t = 0;
2288                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
2289                 if (U_SUCCESS(status)) {
2290                     resultString.append(testMat->group(groupNum, status));
2291                     status = U_ZERO_ERROR;
2292                 }
2293                 perlExpr.remove(0, cgMat->end(status));
2294             }
2295
2296             else if (perlExpr.startsWith("@-")) {
2297                 int32_t i;
2298                 for (i=0; i<=testMat->groupCount(); i++) {
2299                     if (i>0) {
2300                         resultString.append(" ");
2301                     }
2302                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
2303                 }
2304                 perlExpr.remove(0, 2);
2305             }
2306
2307             else if (perlExpr.startsWith("@+")) {
2308                 int32_t i;
2309                 for (i=0; i<=testMat->groupCount(); i++) {
2310                     if (i>0) {
2311                         resultString.append(" ");
2312                     }
2313                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
2314                 }
2315                 perlExpr.remove(0, 2);
2316             }
2317
2318             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
2319                                                      //           or as an escaped sequence (e.g. \n)
2320                 if (perlExpr.length() > 1) {
2321                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
2322                 }
2323                 UChar c = perlExpr.charAt(0);
2324                 switch (c) {
2325                 case 'n':   c = '\n'; break;
2326                 // add any other escape sequences that show up in the test expected results.
2327                 }
2328                 resultString.append(c);
2329                 perlExpr.remove(0, 1);
2330             }
2331
2332             else  {
2333                 // Any characters from the perl expression that we don't explicitly
2334                 //  recognize before here are assumed to be literals and copied
2335                 //  as-is to the expected results.
2336                 resultString.append(perlExpr.charAt(0));
2337                 perlExpr.remove(0, 1);
2338             }
2339
2340             if (U_FAILURE(status)) {
2341                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
2342                 break;
2343             }
2344         }
2345
2346         //
2347         // Expected Results Compare
2348         //
2349         UnicodeString expectedS(fields[4]);
2350         expectedS.findAndReplace(nulnulSrc, nulnul);
2351         expectedS.findAndReplace(ffffSrc,   ffff);
2352         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
2353
2354
2355         if (expectedS.compare(resultString) != 0) {
2356             err("Line %d: Incorrect perl expression results.", lineNum);
2357             errln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
2358         }
2359
2360         delete testMat;
2361         delete testPat;
2362     }
2363
2364     //
2365     // All done.  Clean up allocated stuff.
2366     //
2367     delete cgMat;
2368     delete cgPat;
2369
2370     delete groupsMat;
2371     delete groupsPat;
2372
2373     delete flagMat;
2374     delete flagPat;
2375
2376     delete lineMat;
2377     delete linePat;
2378
2379     delete fieldPat;
2380     delete [] testData;
2381
2382
2383     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
2384
2385 }
2386
2387
2388 //
2389 //   Callbacks()    Test the callback function.
2390 //                  When set, callbacks occur periodically during matching operations,
2391 //                  giving the application code the ability to abort the operation
//                  before its normal completion.
2393 //
2394
2395 struct callBackContext {
2396     RegexTest        *test;
2397     int32_t          maxCalls;
2398     int32_t          numCalls;
2399     int32_t          lastSteps;
2400     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
2401 };
2402
2403 U_CDECL_BEGIN
2404 static UBool U_CALLCONV
2405 testCallBackFn(const void *context, int32_t steps) {
2406     callBackContext  *info = (callBackContext *)context;
2407     if (info->lastSteps+1 != steps) {
2408         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
2409     }
2410     info->lastSteps = steps;
2411     info->numCalls++;
2412     return (info->numCalls < info->maxCalls);
2413 }
2414 U_CDECL_END
2415
2416 void RegexTest::Callbacks() {
2417    {
2418         // Getter returns NULLs if no callback has been set
2419
2420         //   The variables that the getter will fill in.
2421         //   Init to non-null values so that the action of the getter can be seen.
2422         const void          *returnedContext = &returnedContext;
2423         URegexMatchCallback *returnedFn = &testCallBackFn;
2424
2425         UErrorCode status = U_ZERO_ERROR;
2426         RegexMatcher matcher("x", 0, status);
2427         REGEX_CHECK_STATUS;
2428         matcher.getMatchCallback(returnedFn, returnedContext, status);
2429         REGEX_CHECK_STATUS;
2430         REGEX_ASSERT(returnedFn == NULL);
2431         REGEX_ASSERT(returnedContext == NULL);
2432     }
2433
2434    {
2435         // Set and Get work
2436         callBackContext cbInfo = {this, 0, 0, 0};
2437         const void          *returnedContext;
2438         URegexMatchCallback *returnedFn;
2439         UErrorCode status = U_ZERO_ERROR;
2440         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
2441         REGEX_CHECK_STATUS;
2442         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
2443         REGEX_CHECK_STATUS;
2444         matcher.getMatchCallback(returnedFn, returnedContext, status);
2445         REGEX_CHECK_STATUS;
2446         REGEX_ASSERT(returnedFn == testCallBackFn);
2447         REGEX_ASSERT(returnedContext == &cbInfo);
2448
2449         // A short-running match shouldn't invoke the callback
2450         status = U_ZERO_ERROR;
2451         cbInfo.reset(1);
2452         UnicodeString s = "xxx";
2453         matcher.reset(s);
2454         REGEX_ASSERT(matcher.matches(status));
2455         REGEX_CHECK_STATUS;
2456         REGEX_ASSERT(cbInfo.numCalls == 0);
2457
2458         // A medium-length match that runs long enough to invoke the
2459         //   callback, but not so long that the callback aborts it.
2460         status = U_ZERO_ERROR;
2461         cbInfo.reset(4);
2462         s = "aaaaaaaaaaaaaaaaaaab";
2463         matcher.reset(s);
2464         REGEX_ASSERT(matcher.matches(status)==FALSE);
2465         REGEX_CHECK_STATUS;
2466         REGEX_ASSERT(cbInfo.numCalls > 0);
2467
2468         // A longer running match that the callback function will abort.
2469         status = U_ZERO_ERROR;
2470         cbInfo.reset(4);
2471         s = "aaaaaaaaaaaaaaaaaaaaaaab";
2472         matcher.reset(s);
2473         REGEX_ASSERT(matcher.matches(status)==FALSE);
2474         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
2475         REGEX_ASSERT(cbInfo.numCalls == 4);
2476     }
2477
2478
2479 }
2480
2481 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
2482