icuSources/test/intltest/rbbitst.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1999-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /************************************************************************
   9 *   Date        Name        Description
  10 *   12/15/99    Madhu        Creation.
  11 *   01/12/2000  Madhu        Updated for changed API and added new tests
  12 ************************************************************************/
  13
  14 #include "unicode/utypes.h"
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <utility>
  21 #include <vector>
  22
  23 #include "unicode/brkiter.h"
  24 #include "unicode/localpointer.h"
  25 #include "unicode/numfmt.h"
  26 #include "unicode/rbbi.h"
  27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  28 #include "unicode/regex.h"
  29 #endif
  30 #include "unicode/schriter.h"
  31 #include "unicode/uchar.h"
  32 #include "unicode/utf16.h"
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uniset.h"
  35 #include "unicode/uscript.h"
  36 #include "unicode/ustring.h"
  37 #include "unicode/utext.h"
  38
  39 #include "charstr.h"
  40 #include "cmemory.h"
  41 #include "cstr.h"
  42 #include "intltest.h"
  43 #include "rbbitst.h"
  44 #include "rbbidata.h"
  45 #include "utypeinfo.h"  // for 'typeid' to work
  46 #include "uvector.h"
  47 #include "uvectr32.h"
  48
  49
  50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  51 #include "unicode/filteredbrk.h"
  52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  53
  54 #define TEST_ASSERT(x) {if (!(x)) { \
  55     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  56
  57 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  58     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  59
  60 //---------------------------------------------
  61 // runIndexedTest
  62 //---------------------------------------------
  63
  64
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the test data file rbbitst.txt.  In most cases it can:
//         it is much less work than writing a new test, the diagnostic output in the event
//         of failures is good, and the test data file is shared with ICU4J, so eventually
//         the test will run there as well, without additional effort.
  70
  71 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  72 {
  73     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  74     fTestParams = params;
  75
  76     TESTCASE_AUTO_BEGIN;
  77 #if !UCONFIG_NO_FILE_IO
  78     TESTCASE_AUTO(TestBug4153072);
  79 #endif
  80 #if !UCONFIG_NO_FILE_IO
  81     TESTCASE_AUTO(TestUnicodeFiles);
  82 #endif
  83     TESTCASE_AUTO(TestGetAvailableLocales);
  84     TESTCASE_AUTO(TestGetDisplayName);
  85 #if !UCONFIG_NO_FILE_IO
  86     TESTCASE_AUTO(TestEndBehaviour);
  87     TESTCASE_AUTO(TestWordBreaks);
  88     TESTCASE_AUTO(TestWordBoundary);
  89     TESTCASE_AUTO(TestLineBreaks);
  90     TESTCASE_AUTO(TestSentBreaks);
  91     TESTCASE_AUTO(TestExtended);
  92 #endif
  93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
  94     TESTCASE_AUTO(TestMonkey);
  95 #endif
  96 #if !UCONFIG_NO_FILE_IO
  97     TESTCASE_AUTO(TestBug3818);
  98 #endif
  99     TESTCASE_AUTO(TestDebug);
 100 #if !UCONFIG_NO_FILE_IO
 101     TESTCASE_AUTO(TestBug5775);
 102 #endif
 103     TESTCASE_AUTO(TestBug9983);
 104     TESTCASE_AUTO(TestDictRules);
 105     TESTCASE_AUTO(TestBug5532);
 106     TESTCASE_AUTO(TestBug7547);
 107     TESTCASE_AUTO(TestBug12797);
 108     TESTCASE_AUTO(TestBug12918);
 109     TESTCASE_AUTO(TestBug12932);
 110     TESTCASE_AUTO(TestEmoji);
 111     TESTCASE_AUTO(TestBug12519);
 112     TESTCASE_AUTO(TestBug12677);
 113     TESTCASE_AUTO(TestTableRedundancies);
 114     TESTCASE_AUTO(TestBug13447);
 115     TESTCASE_AUTO(TestReverse);
 116     TESTCASE_AUTO(TestBug13692);
 117     TESTCASE_AUTO_END;
 118 }
 119
 120
 121 //--------------------------------------------------------------------------------------
 122 //
 123 //    RBBITest    constructor and destructor
 124 //
 125 //--------------------------------------------------------------------------------------
 126
 127 RBBITest::RBBITest() {
 128     fTestParams = NULL;
 129 }
 130
 131
 132 RBBITest::~RBBITest() {
 133 }
 134
 135
 136 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 137     UErrorCode status = U_ZERO_ERROR;
 138     char name[100];
 139     printf("code    alpha extend alphanum type word sent line name\n");
 140     int nextExpectedIndex = 0;
 141     utext_setNativeIndex(tstr, 0);
 142     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
 143         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 144             printf("------------------------------------------------ %d\n", j);
 145             ++nextExpectedIndex;
 146         }
 147
 148         UChar32 c = utext_next32(tstr);
 149         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 150         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 151                            u_isUAlphabetic(c),
 152                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 153                            u_isalnum(c),
 154                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 155                                                   u_charType(c),
 156                                                   U_SHORT_PROPERTY_NAME),
 157                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 158                                                   u_getIntPropertyValue(c,
 159                                                           UCHAR_WORD_BREAK),
 160                                                   U_SHORT_PROPERTY_NAME),
 161                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 162                                    u_getIntPropertyValue(c,
 163                                            UCHAR_SENTENCE_BREAK),
 164                                    U_SHORT_PROPERTY_NAME),
 165                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 166                                    u_getIntPropertyValue(c,
 167                                            UCHAR_LINE_BREAK),
 168                                    U_SHORT_PROPERTY_NAME),
 169                            name);
 170     }
 171 }
 172
 173
 174 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 175    UErrorCode status = U_ZERO_ERROR;
 176    UText *tstr = NULL;
 177    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 178    if (U_FAILURE(status)) {
 179        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 180        return;
 181     }
 182    printStringBreaks(tstr, expected, expectedCount);
 183    utext_close(tstr);
 184 }
 185
 186
 187 void RBBITest::TestBug3818() {
 188     UErrorCode  status = U_ZERO_ERROR;
 189
 190     // Four Thai words...
 191     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 192                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 193     UnicodeString  thaiStr(thaiWordData);
 194
 195     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 196     if (U_FAILURE(status) || bi == NULL) {
 197         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 198         return;
 199     }
 200     bi->setText(thaiStr);
 201
 202     int32_t  startOfSecondWord = bi->following(1);
 203     if (startOfSecondWord != 4) {
 204         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 205             __FILE__, __LINE__, startOfSecondWord);
 206     }
 207     startOfSecondWord = bi->following(0);
 208     if (startOfSecondWord != 4) {
 209         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 210             __FILE__, __LINE__, startOfSecondWord);
 211     }
 212     delete bi;
 213 }
 214
 215
 216 //---------------------------------------------
 217 //
 218 //     other tests
 219 //
 220 //---------------------------------------------
 221
 222 void RBBITest::TestGetAvailableLocales()
 223 {
 224     int32_t locCount = 0;
 225     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 226
 227     if (locCount == 0)
 228         dataerrln("getAvailableLocales() returned an empty list!");
 229     // Just make sure that it's returning good memory.
 230     int32_t i;
 231     for (i = 0; i < locCount; ++i) {
 232         logln(locList[i].getName());
 233     }
 234 }
 235
 236 //Testing the BreakIterator::getDisplayName() function
 237 void RBBITest::TestGetDisplayName()
 238 {
 239     UnicodeString   result;
 240
 241     BreakIterator::getDisplayName(Locale::getUS(), result);
 242     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 243         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 244                 + result);
 245
 246     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 247     if (result != "French (France)")
 248         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 249                 + result);
 250 }
 251 /**
 252  * Test End Behaviour
 253  * @bug 4068137
 254  */
 255 void RBBITest::TestEndBehaviour()
 256 {
 257     UErrorCode status = U_ZERO_ERROR;
 258     UnicodeString testString("boo.");
 259     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 260     if (U_FAILURE(status))
 261     {
 262         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 263         return;
 264     }
 265     wb->setText(testString);
 266
 267     if (wb->first() != 0)
 268         errln("Didn't get break at beginning of string.");
 269     if (wb->next() != 3)
 270         errln("Didn't get break before period in \"boo.\"");
 271     if (wb->current() != 4 && wb->next() != 4)
 272         errln("Didn't get break at end of string.");
 273     delete wb;
 274 }
 275 /*
 276  * @bug 4153072
 277  */
 278 void RBBITest::TestBug4153072() {
 279     UErrorCode status = U_ZERO_ERROR;
 280     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 281     if (U_FAILURE(status))
 282     {
 283         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 284         return;
 285     }
 286     UnicodeString str("...Hello, World!...");
 287     int32_t begin = 3;
 288     int32_t end = str.length() - 3;
 289     UBool onBoundary;
 290
 291     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 292     iter->adoptText(textIterator);
 293     int index;
 294     // Note: with the switch to UText, there is no way to restrict the
 295     //       iteration range to begin at an index other than zero.
 296     //       String character iterators created with a non-zero bound are
 297     //         treated by RBBI as being empty.
 298     for (index = -1; index < begin + 1; ++index) {
 299         onBoundary = iter->isBoundary(index);
 300         if (index == 0?  !onBoundary : onBoundary) {
 301             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 302                             " and begin index = " + begin);
 303         }
 304     }
 305     delete iter;
 306 }
 307
 308
 309 //
 310 // Test for problem reported by Ashok Matoria on 9 July 2007
 311 //    One.<kSoftHyphen><kSpace>Two.
 312 //
 313 //    Sentence break at start (0) and then on calling next() it breaks at
 314 //   'T' of "Two". Now, at this point if I do next() and
 315 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 316 //
 317 void RBBITest::TestBug5775() {
 318     UErrorCode status = U_ZERO_ERROR;
 319     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 320     TEST_ASSERT_SUCCESS(status);
 321     if (U_FAILURE(status)) {
 322         return;
 323     }
 324 // Check for status first for better handling of no data errors.
 325     TEST_ASSERT(bi != NULL);
 326     if (bi == NULL) {
 327         return;
 328     }
 329
 330     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 331     //               01234      56789
 332     s = s.unescape();
 333     bi->setText(s);
 334     int pos = bi->next();
 335     TEST_ASSERT(pos == 6);
 336     pos = bi->next();
 337     TEST_ASSERT(pos == 10);
 338     pos = bi->previous();
 339     TEST_ASSERT(pos == 6);
 340     delete bi;
 341 }
 342
 343
 344
 345 //------------------------------------------------------------------------------
 346 //
 347 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 348 //
 349 //------------------------------------------------------------------------------
 350
 351 struct TestParams {
 352     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 353                                            //   Changed out whenever test data changes break type.
 354
 355     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 356     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 357     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 358     UVector32       *srcCol;
 359
 360     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 361     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 362     CharString       utf8String;           // UTF-8 form of text to break.
 363
 364     TestParams(UErrorCode &status) : dataToBreak() {
 365         bi               = NULL;
 366         expectedBreaks   = new UVector32(status);
 367         srcLine          = new UVector32(status);
 368         srcCol           = new UVector32(status);
 369         textToBreak      = NULL;
 370         textMap          = new UVector32(status);
 371     }
 372
 373     ~TestParams() {
 374         delete bi;
 375         delete expectedBreaks;
 376         delete srcLine;
 377         delete srcCol;
 378         utext_close(textToBreak);
 379         delete textMap;
 380     }
 381
 382     int32_t getSrcLine(int32_t bp);
 383     int32_t getExpectedBreak(int32_t bp);
 384     int32_t getSrcCol(int32_t bp);
 385
 386     void setUTF16(UErrorCode &status);
 387     void setUTF8(UErrorCode &status);
 388 };
 389
 390 // Append a UnicodeString to a CharString with UTF-8 encoding.
 391 // Substitute any invalid chars.
 392 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 393 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 394     if (U_FAILURE(status)) {
 395         return;
 396     }
 397     int32_t utf8Length;
 398     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 399                        src.getBuffer(), src.length(),   // UTF-16 data
 400                        0xfffd, NULL,                    // Substitution char, number of subs.
 401                        &status);
 402     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 403         return;
 404     }
 405     status = U_ZERO_ERROR;
 406     int32_t capacity;
 407     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 408     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 409                        src.getBuffer(), src.length(),
 410                        0xfffd, NULL, &status);
 411     dest.append(buffer, utf8Length, status);
 412 }
 413
 414
 415 void TestParams::setUTF16(UErrorCode &status) {
 416     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 417     textMap->removeAllElements();
 418     for (int32_t i=0; i<dataToBreak.length(); i++) {
 419         if (i == dataToBreak.getChar32Start(i)) {
 420             textMap->addElement(i, status);
 421         } else {
 422             textMap->addElement(-1, status);
 423         }
 424     }
 425     textMap->addElement(dataToBreak.length(), status);
 426     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 427 }
 428
 429
 430 void TestParams::setUTF8(UErrorCode &status) {
 431     if (U_FAILURE(status)) {
 432         return;
 433     }
 434     utf8String.clear();
 435     CharStringAppend(utf8String, dataToBreak, status);
 436     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 437     if (U_FAILURE(status)) {
 438         return;
 439     }
 440
 441     textMap->removeAllElements();
 442     int32_t utf16Index = 0;
 443     for (;;) {
 444         textMap->addElement(utf16Index, status);
 445         UChar32 c32 = utext_current32(textToBreak);
 446         if (c32 < 0) {
 447             break;
 448         }
 449         utf16Index += U16_LENGTH(c32);
 450         utext_next32(textToBreak);
 451         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 452             textMap->addElement(-1, status);
 453         }
 454     }
 455     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 456 }
 457
 458
 459 int32_t TestParams::getSrcLine(int32_t bp) {
 460     if (bp >= textMap->size()) {
 461         bp = textMap->size() - 1;
 462     }
 463     int32_t i = 0;
 464     for(; bp >= 0 ; --bp) {
 465         // Move to a character boundary if we are not on one already.
 466         i = textMap->elementAti(bp);
 467         if (i >= 0) {
 468             break;
 469         }
 470     }
 471     return srcLine->elementAti(i);
 472 }
 473
 474
 475 int32_t TestParams::getExpectedBreak(int32_t bp) {
 476     if (bp >= textMap->size()) {
 477         return 0;
 478     }
 479     int32_t i = textMap->elementAti(bp);
 480     int32_t retVal = 0;
 481     if (i >= 0) {
 482         retVal = expectedBreaks->elementAti(i);
 483     }
 484     return retVal;
 485 }
 486
 487
 488 int32_t TestParams::getSrcCol(int32_t bp) {
 489     if (bp >= textMap->size()) {
 490         bp = textMap->size() - 1;
 491     }
 492     int32_t i = 0;
 493     for(; bp >= 0; --bp) {
 494         // Move bp to a character boundary if we are not on one already.
 495         i = textMap->elementAti(bp);
 496         if (i >= 0) {
 497             break;
 498         }
 499     }
 500     return srcCol->elementAti(i);
 501 }
 502
 503
 504 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 505     int32_t    bp;
 506     int32_t    prevBP;
 507     int32_t    i;
 508
 509     TEST_ASSERT_SUCCESS(status);
 510     if (U_FAILURE(status)) {
 511         return;
 512     }
 513
 514     if (t->bi == NULL) {
 515         return;
 516     }
 517
 518     t->bi->setText(t->textToBreak, status);
 519     //
 520     //  Run the iterator forward
 521     //
 522     prevBP = -1;
 523     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 524         if (prevBP ==  bp) {
 525             // Fail for lack of forward progress.
 526             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 527                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 528             break;
 529         }
 530
 531         // Check that there we didn't miss an expected break between the last one
 532         //  and this one.
 533         for (i=prevBP+1; i<bp; i++) {
 534             if (t->getExpectedBreak(i) != 0) {
 535                 int expected[] = {0, i};
 536                 printStringBreaks(t->dataToBreak, expected, 2);
 537                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 538                       i, t->getSrcLine(i), t->getSrcCol(i));
 539             }
 540         }
 541
 542         // Check that the break we did find was expected
 543         if (t->getExpectedBreak(bp) == 0) {
 544             int expected[] = {0, bp};
 545             printStringBreaks(t->textToBreak, expected, 2);
 546             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 547                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 548         } else {
 549             // The break was expected.
 550             //   Check that the {nnn} tag value is correct.
 551             int32_t expectedTagVal = t->getExpectedBreak(bp);
 552             if (expectedTagVal == -1) {
 553                 expectedTagVal = 0;
 554             }
 555             int32_t line = t->getSrcLine(bp);
 556             int32_t rs = t->bi->getRuleStatus();
 557             if (rs != expectedTagVal) {
 558                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
 559                       "          Actual, Expected status = %4d, %4d",
 560                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
 561             }
 562         }
 563
 564         prevBP = bp;
 565     }
 566
 567     // Verify that there were no missed expected breaks after the last one found
 568     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
 569         if (t->getExpectedBreak(i) != 0) {
 570             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 571                       i, t->getSrcLine(i), t->getSrcCol(i));
 572         }
 573     }
 574
 575     //
 576     //  Run the iterator backwards, verify that the same breaks are found.
 577     //
 578     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
 579     bp = t->bi->last();
 580     while (bp != BreakIterator::DONE) {
 581         if (prevBP ==  bp) {
 582             // Fail for lack of progress.
 583             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
 584                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 585             break;
 586         }
 587
 588         // Check that we didn't miss an expected break between the last one
 589         //  and this one.  (UVector returns zeros for index out of bounds.)
 590         for (i=prevBP-1; i>bp; i--) {
 591             if (t->getExpectedBreak(i) != 0) {
 592                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 593                       i, t->getSrcLine(i), t->getSrcCol(i));
 594             }
 595         }
 596
 597         // Check that the break we did find was expected
 598         if (t->getExpectedBreak(bp) == 0) {
 599             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 600                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
 601         } else {
 602             // The break was expected.
 603             //   Check that the {nnn} tag value is correct.
 604             int32_t expectedTagVal = t->getExpectedBreak(bp);
 605             if (expectedTagVal == -1) {
 606                 expectedTagVal = 0;
 607             }
 608             int line = t->getSrcLine(bp);
 609             int32_t rs = t->bi->getRuleStatus();
 610             if (rs != expectedTagVal) {
 611                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
 612                       "          Actual, Expected status = %4d, %4d",
 613                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
 614             }
 615         }
 616
 617         prevBP = bp;
 618         bp = t->bi->previous();
 619     }
 620
 621     // Verify that there were no missed breaks prior to the last one found
 622     for (i=prevBP-1; i>=0; i--) {
 623         if (t->getExpectedBreak(i) != 0) {
 624             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 625                       i, t->getSrcLine(i), t->getSrcCol(i));
 626         }
 627     }
 628
 629     // Check isBoundary()
 630     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
 631         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
 632         UBool boundaryFound    = t->bi->isBoundary(i);
 633         if (boundaryExpected != boundaryFound) {
 634             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
 635                   "        Expected, Actual= %s, %s",
 636                   i, t->getSrcLine(i), t->getSrcCol(i),
 637                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
 638         }
 639     }
 640
 641     // Check following()
 642     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
 643         int32_t actualBreak = t->bi->following(i);
 644         int32_t expectedBreak = BreakIterator::DONE;
 645         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
 646             if (t->getExpectedBreak(j) != 0) {
 647                 expectedBreak = j;
 648                 break;
 649             }
 650         }
 651         if (expectedBreak != actualBreak) {
 652             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
 653                   "        Expected, Actual= %d, %d",
 654                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
 655         }
 656     }
 657
 658     // Check preceding()
 659     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
 660         int32_t actualBreak = t->bi->preceding(i);
 661         int32_t expectedBreak = BreakIterator::DONE;
 662
 663         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
 664         // preceding(trailing byte) will return the index of some preceding code point,
 665         // not the lead byte of the current code point, even though that has a smaller index.
 666         // Therefore, start looking at the expected break data not at i-1, but at
 667         // the start of code point index - 1.
 668         utext_setNativeIndex(t->textToBreak, i);
 669         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
 670         for (; j >= 0; j--) {
 671             if (t->getExpectedBreak(j) != 0) {
 672                 expectedBreak = j;
 673                 break;
 674             }
 675         }
 676         if (expectedBreak != actualBreak) {
 677             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
 678                   "        Expected, Actual= %d, %d",
 679                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
 680         }
 681     }
 682 }
 683
 684
 685 void RBBITest::TestExtended() {
 686   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
 687   // data driven test closely entangles filtered and regular data.
 688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
 689     UErrorCode      status  = U_ZERO_ERROR;
 690     Locale          locale("");
 691
 692     TestParams          tp(status);
 693
 694     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
 695     if (U_FAILURE(status)) {
 696         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
 697     }
 698
 699     //
 700     //  Open and read the test data file.
 701     //
 702     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 703     CharString testFileName(testDataDirectory, -1, status);
 704     testFileName.append("rbbitst.txt", -1, status);
 705
 706     int    len;
 707     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
 708     if (U_FAILURE(status)) {
 709         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
 710         return;
 711     }
 712
 713     bool skipTest = false; // Skip this test?
 714
 715     //
 716     //  Put the test data into a UnicodeString
 717     //
 718     UnicodeString testString(FALSE, testFile, len);
 719
 720     enum EParseState{
 721         PARSE_COMMENT,
 722         PARSE_TAG,
 723         PARSE_DATA,
 724         PARSE_NUM,
 725         PARSE_RULES
 726     }
 727     parseState = PARSE_TAG;
 728
 729     EParseState savedState = PARSE_TAG;
 730
 731     int32_t    lineNum  = 1;
 732     int32_t    colStart = 0;
 733     int32_t    column   = 0;
 734     int32_t    charIdx  = 0;
 735
 736     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
 737
 738     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
 739     int32_t             rulesFirstLine;  // Line number of the start of current <rules> block
 740
 741     for (charIdx = 0; charIdx < len; ) {
 742         status = U_ZERO_ERROR;
 743         UChar  c = testString.charAt(charIdx);
 744         charIdx++;
 745         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
 746             // treat CRLF as a unit
 747             c = u'\n';
 748             charIdx++;
 749         }
 750         if (c == u'\n' || c == u'\r') {
 751             lineNum++;
 752             colStart = charIdx;
 753         }
 754         column = charIdx - colStart + 1;
 755
 756         switch (parseState) {
 757         case PARSE_COMMENT:
 758             if (c == u'\n' || c == u'\r') {
 759                 parseState = savedState;
 760             }
 761             break;
 762
 763         case PARSE_TAG:
 764             {
 765             if (c == u'#') {
 766                 parseState = PARSE_COMMENT;
 767                 savedState = PARSE_TAG;
 768                 break;
 769             }
 770             if (u_isUWhiteSpace(c)) {
 771                 break;
 772             }
 773             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
 774                 delete tp.bi;
 775                 tp.bi = BreakIterator::createWordInstance(locale,  status);
 776                 skipTest = false;
 777                 charIdx += 5;
 778                 break;
 779             }
 780             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
 781                 delete tp.bi;
 782                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
 783                 skipTest = false;
 784                 charIdx += 5;
 785                 break;
 786             }
 787             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
 788                 delete tp.bi;
 789                 tp.bi = BreakIterator::createLineInstance(locale,  status);
 790                 skipTest = false;
 791                 charIdx += 5;
 792                 break;
 793             }
 794             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
 795                 delete tp.bi;
 796                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
 797                 skipTest = false;
 798                 charIdx += 5;
 799                 break;
 800             }
 801             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
 802                 delete tp.bi;
 803                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
 804                 charIdx += 6;
 805                 break;
 806             }
 807
 808             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
 809                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
 810                 charIdx = testString.indexOf(u'>', charIdx) + 1;
 811                 parseState = PARSE_RULES;
 812                 rules.remove();
 813                 rulesFirstLine = lineNum;
 814                 break;
 815             }
 816
 817             // <locale  loc_name>
 818             localeMatcher.reset(testString);
 819             if (localeMatcher.lookingAt(charIdx-1, status)) {
 820                 UnicodeString localeName = localeMatcher.group(1, status);
 821                 char localeName8[100];
 822                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
 823                 locale = Locale::createFromName(localeName8);
 824                 charIdx += localeMatcher.group(0, status).length() - 1;
 825                 TEST_ASSERT_SUCCESS(status);
 826                 break;
 827             }
 828             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
 829                 parseState = PARSE_DATA;
 830                 charIdx += 5;
 831                 tp.dataToBreak = "";
 832                 tp.expectedBreaks->removeAllElements();
 833                 tp.srcCol ->removeAllElements();
 834                 tp.srcLine->removeAllElements();
 835                 break;
 836             }
 837
 838             errln("line %d: Tag expected in test file.", lineNum);
 839             parseState = PARSE_COMMENT;
 840             savedState = PARSE_DATA;
 841             goto end_test; // Stop the test.
 842             }
 843             break;
 844
 845         case PARSE_RULES:
 846             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
 847                 charIdx += 7;
 848                 parseState = PARSE_TAG;
 849                 delete tp.bi;
 850                 UParseError pe;
 851                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
 852                 skipTest = U_FAILURE(status);
 853                 if (U_FAILURE(status)) {
 854                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
 855                         rulesFirstLine + pe.line - 1, u_errorName(status));
 856                 }
 857             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
 858                 charIdx += 10;
 859                 parseState = PARSE_TAG;
 860                 UErrorCode ec = U_ZERO_ERROR;
 861                 UParseError pe;
 862                 RuleBasedBreakIterator bi(rules, pe, ec);
 863                 if (U_SUCCESS(ec)) {
 864                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
 865                         rulesFirstLine + pe.line - 1);
 866                 }
 867             } else {
 868                 rules.append(c);
 869             }
 870             break;
 871
 872         case PARSE_DATA:
 873             if (c == u'\u2022') { // u'•'
 874                 int32_t  breakIdx = tp.dataToBreak.length();
 875                 tp.expectedBreaks->setSize(breakIdx+1);
 876                 tp.expectedBreaks->setElementAt(-1, breakIdx);
 877                 tp.srcLine->setSize(breakIdx+1);
 878                 tp.srcLine->setElementAt(lineNum, breakIdx);
 879                 tp.srcCol ->setSize(breakIdx+1);
 880                 tp.srcCol ->setElementAt(column, breakIdx);
 881                 break;
 882             }
 883
 884             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
 885                 // Add final entry to mappings from break location to source file position.
 886                 //  Need one extra because last break position returned is after the
 887                 //    last char in the data, not at the last char.
 888                 tp.srcLine->addElement(lineNum, status);
 889                 tp.srcCol ->addElement(column, status);
 890
 891                 parseState = PARSE_TAG;
 892                 charIdx += 6;
 893
 894                 if (!skipTest) {
 895                     // RUN THE TEST!
 896                     status = U_ZERO_ERROR;
 897                     tp.setUTF16(status);
 898                     executeTest(&tp, status);
 899                     TEST_ASSERT_SUCCESS(status);
 900
 901                     // Run again, this time with UTF-8 text wrapped in a UText.
 902                     status = U_ZERO_ERROR;
 903                     tp.setUTF8(status);
 904                     TEST_ASSERT_SUCCESS(status);
 905                     executeTest(&tp, status);
 906                 }
 907                 break;
 908             }
 909
 910             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
 911                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
 912                 // Get the code point from the name and insert it into the test data.
 913                 //   (Damn, no API takes names in Unicode  !!!
 914                 //    we've got to take it back to char *)
 915                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
 916                 int32_t nameLength = nameEndIdx - (charIdx+2);
 917                 char charNameBuf[200];
 918                 UChar32 theChar = -1;
 919                 if (nameEndIdx != -1) {
 920                     UErrorCode status = U_ZERO_ERROR;
 921                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
 922                     charNameBuf[sizeof(charNameBuf)-1] = 0;
 923                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
 924                     if (U_FAILURE(status)) {
 925                         theChar = -1;
 926                     }
 927                 }
 928                 if (theChar == -1) {
 929                     errln("Error in named character in test file at line %d, col %d",
 930                         lineNum, column);
 931                 } else {
 932                     // Named code point was recognized.  Insert it
 933                     //   into the test data.
 934                     tp.dataToBreak.append(theChar);
 935                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
 936                         tp.srcLine->addElement(lineNum, status);
 937                         tp.srcCol ->addElement(column, status);
 938                     }
 939                 }
 940                 if (nameEndIdx > charIdx) {
 941                     charIdx = nameEndIdx+1;
 942
 943                 }
 944                 break;
 945             }
 946
 947
 948
 949             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
 950                 charIdx++;
 951                 int32_t  breakIdx = tp.dataToBreak.length();
 952                 tp.expectedBreaks->setSize(breakIdx+1);
 953                 tp.expectedBreaks->setElementAt(-1, breakIdx);
 954                 tp.srcLine->setSize(breakIdx+1);
 955                 tp.srcLine->setElementAt(lineNum, breakIdx);
 956                 tp.srcCol ->setSize(breakIdx+1);
 957                 tp.srcCol ->setElementAt(column, breakIdx);
 958                 break;
 959             }
 960
 961             if (c == u'<') {
 962                 tagValue   = 0;
 963                 parseState = PARSE_NUM;
 964                 break;
 965             }
 966
 967             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
 968                 parseState = PARSE_COMMENT;
 969                 savedState = PARSE_DATA;
 970                 break;
 971             }
 972
 973             if (c == u'\\') {
 974                 // Check for \ at end of line, a line continuation.
 975                 //     Advance over (discard) the newline
 976                 UChar32 cp = testString.char32At(charIdx);
 977                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
 978                     // We have a CR LF
 979                     //  Need an extra increment of the input ptr to move over both of them
 980                     charIdx++;
 981                 }
 982                 if (cp == u'\n' || cp == u'\r') {
 983                     lineNum++;
 984                     colStart = charIdx;
 985                     charIdx++;
 986                     break;
 987                 }
 988
 989                 // Let unescape handle the back slash.
 990                 cp = testString.unescapeAt(charIdx);
 991                 if (cp != -1) {
 992                     // Escape sequence was recognized.  Insert the char
 993                     //   into the test data.
 994                     tp.dataToBreak.append(cp);
 995                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
 996                         tp.srcLine->addElement(lineNum, status);
 997                         tp.srcCol ->addElement(column, status);
 998                     }
 999                     break;
1000                 }
1001
1002
1003                 // Not a recognized backslash escape sequence.
1004                 // Take the next char as a literal.
1005                 //  TODO:  Should this be an error?
1006                 c = testString.charAt(charIdx);
1007                 charIdx = testString.moveIndex32(charIdx, 1);
1008             }
1009
1010             // Normal, non-escaped data char.
1011             tp.dataToBreak.append(c);
1012
1013             // Save the mapping from offset in the data to line/column numbers in
1014             //   the original input file.  Will be used for better error messages only.
1015             //   If there's an expected break before this char, the slot in the mapping
1016             //     vector will already be set for this char; don't overwrite it.
1017             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1018                 tp.srcLine->addElement(lineNum, status);
1019                 tp.srcCol ->addElement(column, status);
1020             }
1021             break;
1022
1023
1024         case PARSE_NUM:
1025             // We are parsing an expected numeric tag value, like <1234>,
1026             //   within a chunk of data.
1027             if (u_isUWhiteSpace(c)) {
1028                 break;
1029             }
1030
1031             if (c == u'>') {
1032                 // Finished the number.  Add the info to the expected break data,
1033                 //   and switch parse state back to doing plain data.
1034                 parseState = PARSE_DATA;
1035                 if (tagValue == 0) {
1036                     tagValue = -1;
1037                 }
1038                 int32_t  breakIdx = tp.dataToBreak.length();
1039                 tp.expectedBreaks->setSize(breakIdx+1);
1040                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1041                 tp.srcLine->setSize(breakIdx+1);
1042                 tp.srcLine->setElementAt(lineNum, breakIdx);
1043                 tp.srcCol ->setSize(breakIdx+1);
1044                 tp.srcCol ->setElementAt(column, breakIdx);
1045                 break;
1046             }
1047
1048             if (u_isdigit(c)) {
1049                 tagValue = tagValue*10 + u_charDigitValue(c);
1050                 break;
1051             }
1052
1053             errln("Syntax Error in test file at line %d, col %d",
1054                 lineNum, column);
1055             parseState = PARSE_COMMENT;
1056             goto end_test; // Stop the test
1057             break;
1058         }
1059
1060
1061         if (U_FAILURE(status)) {
1062             dataerrln("ICU Error %s while parsing test file at line %d.",
1063                 u_errorName(status), lineNum);
1064             status = U_ZERO_ERROR;
1065             goto end_test; // Stop the test
1066         }
1067
1068     }
1069
1070     // Reached end of test file. Raise an error if parseState indicates that we are
1071     //   within a block that should have been terminated.
1072
1073     if (parseState == PARSE_RULES) {
1074         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075             lineNum, rulesFirstLine);
1076     }
1077     if (parseState == PARSE_DATA) {
1078         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1079     }
1080
1081
1082 end_test:
1083     delete [] testFile;
1084 #endif
1085 }
1086
1087
1088 //-------------------------------------------------------------------------------
1089 //
1090 //  TestDictRules   create a break iterator from source rules that includes a
1091 //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
1093 //                  but the dictionary code, without a type, would loop.
1094 //
1095 //-------------------------------------------------------------------------------
1096 void RBBITest::TestDictRules() {
1097     const char *rules =  "$dictionary = [a-z]; \n"
1098                          "!!forward; \n"
1099                          "$dictionary $dictionary; \n"
1100                          "!!reverse; \n"
1101                          "$dictionary $dictionary; \n";
1102     const char *text = "aa";
1103     UErrorCode status = U_ZERO_ERROR;
1104     UParseError parseError;
1105
1106     RuleBasedBreakIterator bi(rules, parseError, status);
1107     if (U_SUCCESS(status)) {
1108         UnicodeString utext = text;
1109         bi.setText(utext);
1110         int32_t position;
1111         int32_t loops;
1112         for (loops = 0; loops<10; loops++) {
1113             position = bi.next();
1114             if (position == RuleBasedBreakIterator::DONE) {
1115                 break;
1116             }
1117         }
1118         TEST_ASSERT(loops == 1);
1119     } else {
1120         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1121     }
1122 }
1123
1124
1125
1126 //-------------------------------------------------------------------------------
1127 //
1128 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1129 //    return the data in one big UChar * buffer, which the caller must delete.
1130 //
1131 //    parameters:
1132 //          fileName:   the name of the file, with no directory part.  The test data directory
1133 //                      is assumed.
1134 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1135 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1136 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1137 //                      Pass NULL for the system default encoding.
1138 //          status
1139 //    returns:
1140 //                      The file data, converted to UChar.
1141 //                      The caller must delete this when done with
1142 //                           delete [] theBuffer;
1143 //
1144 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1145 //           Move this function to some common place.
1146 //
1147 //--------------------------------------------------------------------------------
1148 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1149     UChar       *retPtr  = NULL;
1150     char        *fileBuf = NULL;
1151     UConverter* conv     = NULL;
1152     FILE        *f       = NULL;
1153
1154     ulen = 0;
1155     if (U_FAILURE(status)) {
1156         return retPtr;
1157     }
1158
1159     //
1160     //  Open the file.
1161     //
1162     f = fopen(fileName, "rb");
1163     if (f == 0) {
1164         dataerrln("Error opening test data file %s\n", fileName);
1165         status = U_FILE_ACCESS_ERROR;
1166         return NULL;
1167     }
1168     //
1169     //  Read it in
1170     //
1171     int   fileSize;
1172     int   amt_read;
1173
1174     fseek( f, 0, SEEK_END);
1175     fileSize = ftell(f);
1176     fileBuf = new char[fileSize];
1177     fseek(f, 0, SEEK_SET);
1178     amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1179     if (amt_read != fileSize || fileSize <= 0) {
1180         errln("Error reading test data file.");
1181         goto cleanUpAndReturn;
1182     }
1183
1184     //
1185     // Look for a Unicode Signature (BOM) on the data just read
1186     //
1187     int32_t        signatureLength;
1188     const char *   fileBufC;
1189     const char*    bomEncoding;
1190
1191     fileBufC = fileBuf;
1192     bomEncoding = ucnv_detectUnicodeSignature(
1193         fileBuf, fileSize, &signatureLength, &status);
1194     if(bomEncoding!=NULL ){
1195         fileBufC  += signatureLength;
1196         fileSize  -= signatureLength;
1197         encoding = bomEncoding;
1198     }
1199
1200     //
1201     // Open a converter to take the rule file to UTF-16
1202     //
1203     conv = ucnv_open(encoding, &status);
1204     if (U_FAILURE(status)) {
1205         goto cleanUpAndReturn;
1206     }
1207
1208     //
1209     // Convert the rules to UChar.
1210     //  Preflight first to determine required buffer size.
1211     //
1212     ulen = ucnv_toUChars(conv,
1213         NULL,           //  dest,
1214         0,              //  destCapacity,
1215         fileBufC,
1216         fileSize,
1217         &status);
1218     if (status == U_BUFFER_OVERFLOW_ERROR) {
1219         // Buffer Overflow is expected from the preflight operation.
1220         status = U_ZERO_ERROR;
1221
1222         retPtr = new UChar[ulen+1];
1223         ucnv_toUChars(conv,
1224             retPtr,       //  dest,
1225             ulen+1,
1226             fileBufC,
1227             fileSize,
1228             &status);
1229     }
1230
1231 cleanUpAndReturn:
1232     fclose(f);
1233     delete []fileBuf;
1234     ucnv_close(conv);
1235     if (U_FAILURE(status)) {
1236         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1237         delete []retPtr;
1238         retPtr = 0;
1239         ulen   = 0;
1240     };
1241     return retPtr;
1242 }
1243
1244
1245
1246 //--------------------------------------------------------------------------------------------
1247 //
1248 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1249 //
1250 //-------------------------------------------------------------------------------------------
1251 void RBBITest::TestUnicodeFiles() {
1252     RuleBasedBreakIterator  *bi;
1253     UErrorCode               status = U_ZERO_ERROR;
1254
1255     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1256     TEST_ASSERT_SUCCESS(status);
1257     if (U_SUCCESS(status)) {
1258         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1259     }
1260     delete bi;
1261
1262     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1263     TEST_ASSERT_SUCCESS(status);
1264     if (U_SUCCESS(status)) {
1265         runUnicodeTestData("WordBreakTest.txt", bi);
1266     }
1267     delete bi;
1268
1269     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1270     TEST_ASSERT_SUCCESS(status);
1271     if (U_SUCCESS(status)) {
1272         runUnicodeTestData("SentenceBreakTest.txt", bi);
1273     }
1274     delete bi;
1275
1276     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1277     TEST_ASSERT_SUCCESS(status);
1278     if (U_SUCCESS(status)) {
1279         runUnicodeTestData("LineBreakTest.txt", bi);
1280     }
1281     delete bi;
1282 }
1283
1284
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped as known issues because ICU does not fully implement
1287 // the Unicode specifications, or because ICU includes tailorings that differ from
1288 // the Unicode standard.
1289 //
1290 // Test cases are identified by the test data sequence, which tends to be more stable
1291 // across Unicode versions than the test file line numbers.
1292 //
1293 // The test case with ticket "10666" is a dummy, included as an example.
1294
1295 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1296     static struct TestCase {
1297         const char *fTicketNum;
1298         const char *fFileName;
1299         const UChar *fString;
1300     } badTestCases[] = {
1301         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1302         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1303         // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1304         // ICU is out of sync with Unicode.
1305         {"8151",  "LineBreakTest.txt", u"-#"},
1306         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1307         {"8151",  "LineBreakTest.txt", u"\u002d\u00a7"},
1308         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1309         {"8151",  "LineBreakTest.txt", u"\u002d\U00050005"},
1310         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1311         {"8151",  "LineBreakTest.txt", u"\u002d\u0e01"},
1312         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1313
1314         // Issue ICU-12017 Improve line break around numbers
1315         {"12017", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1316         {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1317         {"12017", "LineBreakTest.txt", u"find .com"},
1318         {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1319         {"12017", "LineBreakTest.txt", u"a.2 "},
1320         {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1321         {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1322         {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1323         {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1324         {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1325         {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1326         {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1327         {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1328         {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1329         {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1330     };
1331
1332     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1333         const TestCase &badCase = badTestCases[n];
1334         if (!strcmp(fileName, badCase.fFileName) &&
1335                 testCase == UnicodeString(badCase.fString)) {
1336             return logKnownIssue(badCase.fTicketNum);
1337         }
1338     }
1339     return FALSE;
1340 }
1341
1342
1343 //--------------------------------------------------------------------------------------------
1344 //
1345 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1346 //
1347 //-------------------------------------------------------------------------------------------
1348 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1349 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1350     UErrorCode  status = U_ZERO_ERROR;
1351
1352     //
1353     //  Open and read the test data file, put it into a UnicodeString.
1354     //
1355     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1356     char testFileName[1000];
1357     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1358         dataerrln("Can't open test data.  Path too long.");
1359         return;
1360     }
1361     strcpy(testFileName, testDataDirectory);
1362     strcat(testFileName, fileName);
1363
1364     logln("Opening data file %s\n", fileName);
1365
1366     int    len;
1367     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1368     if (status != U_FILE_ACCESS_ERROR) {
1369         TEST_ASSERT_SUCCESS(status);
1370         TEST_ASSERT(testFile != NULL);
1371     }
1372     if (U_FAILURE(status) || testFile == NULL) {
1373         return; /* something went wrong, error already output */
1374     }
1375     UnicodeString testFileAsString(TRUE, testFile, len);
1376
1377     //
1378     //  Parse the test data file using a regular expression.
1379     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1380     //     is identified by which group had a match.
1381     //
1382     //    Caputure Group #                  1          2            3            4           5
1383     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1384     //
1385     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1386     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1387     UnicodeString   testString;
1388     UVector32       breakPositions(status);
1389     int             lineNumber = 1;
1390     TEST_ASSERT_SUCCESS(status);
1391     if (U_FAILURE(status)) {
1392         return;
1393     }
1394
1395     //
1396     //  Scan through each test case, building up the string to be broken in testString,
1397     //   and the positions that should be boundaries in the breakPositions vector.
1398     //
1399     int spin = 0;
1400     while (tokenMatcher.find()) {
1401         if(tokenMatcher.hitEnd()) {
1402           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1403              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1404              and caused an infinite loop here on EBCDIC systems!
1405           */
1406           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1407           //       return;
1408         }
1409         if (tokenMatcher.start(1, status) >= 0) {
1410             // Scanned a divide sign, indicating a break position in the test data.
1411             if (testString.length()>0) {
1412                 breakPositions.addElement(testString.length(), status);
1413             }
1414         }
1415         else if (tokenMatcher.start(2, status) >= 0) {
1416             // Scanned an 'x', meaning no break at this position in the test data
1417             //   Nothing to be done here.
1418             }
1419         else if (tokenMatcher.start(3, status) >= 0) {
1420             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1421             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1422             int length = hexNumber.length();
1423             if (length<=8) {
1424                 char buf[10];
1425                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1426                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1427                 if (c<=0x10ffff) {
1428                     testString.append(c);
1429                 } else {
1430                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1431                        fileName, lineNumber);
1432                 }
1433             } else {
1434                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1435                        fileName, lineNumber);
1436              }
1437         }
1438         else if (tokenMatcher.start(4, status) >= 0) {
1439             // Scanned to end of a line, possibly skipping over a comment in the process.
1440             //   If the line from the file contained test data, run the test now.
1441             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1442                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1443             }
1444
1445             // Clear out this test case.
1446             //    The string and breakPositions vector will be refilled as the next
1447             //       test case is parsed.
1448             testString.remove();
1449             breakPositions.removeAllElements();
1450             lineNumber++;
1451         } else {
1452             // Scanner catchall.  Something unrecognized appeared on the line.
1453             char token[16];
1454             UnicodeString uToken = tokenMatcher.group(0, status);
1455             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1456             token[sizeof(token)-1] = 0;
1457             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1458
1459             // Clean up, in preparation for continuing with the next line.
1460             testString.remove();
1461             breakPositions.removeAllElements();
1462             lineNumber++;
1463         }
1464         TEST_ASSERT_SUCCESS(status);
1465         if (U_FAILURE(status)) {
1466             break;
1467         }
1468     }
1469
1470     delete [] testFile;
1471  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1472 }
1473
1474 //--------------------------------------------------------------------------------------------
1475 //
1476 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1477 //                            test data files.  Do only a simple, forward-only check -
1478 //                            this test is mostly to check that ICU and the Unicode
1479 //                            data agree with each other.
1480 //
1481 //--------------------------------------------------------------------------------------------
1482 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1483                          const UnicodeString &testString,   // Text data to be broken
1484                          UVector32 *breakPositions,         // Positions where breaks should be found.
1485                          RuleBasedBreakIterator *bi) {
1486     int32_t pos;                 // Break Position in the test string
1487     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1488     int32_t expectedPos;         // Expected break position (index into test string)
1489
1490     bi->setText(testString);
1491     pos = bi->first();
1492     pos = bi->next();
1493
1494     while (pos != BreakIterator::DONE) {
1495         if (expectedI >= breakPositions->size()) {
1496             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1497                 testFileName, lineNumber, pos);
1498             break;
1499         }
1500         expectedPos = breakPositions->elementAti(expectedI);
1501         if (pos < expectedPos) {
1502             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1503                 testFileName, lineNumber, pos);
1504             break;
1505         }
1506         if (pos > expectedPos) {
1507             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1508                 testFileName, lineNumber, expectedPos);
1509             break;
1510         }
1511         pos = bi->next();
1512         expectedI++;
1513     }
1514
1515     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1516         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1517             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1518     }
1519 }
1520
1521
1522
1523 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1524 //---------------------------------------------------------------------------------------
1525 //
1526 //   class RBBIMonkeyKind
1527 //
1528 //      Monkey Test for Break Iteration
1529 //      Abstract interface class.   Concrete derived classes independently
1530 //      implement the break rules for different iterator types.
1531 //
1532 //      The Monkey Test itself doesn't know which type of break iterator it is
1533 //      testing, but works purely in terms of the interface defined here.
1534 //
1535 //---------------------------------------------------------------------------------------
1536 class RBBIMonkeyKind {
1537 public:
1538     // Return a UVector of UnicodeSets, representing the character classes used
1539     //   for this type of iterator.
1540     virtual  UVector  *charClasses() = 0;
1541
1542     // Set the test text on which subsequent calls to next() will operate
1543     virtual  void      setText(const UnicodeString &s) = 0;
1544
1545     // Find the next break postion, starting from the prev break position, or from zero.
1546     // Return -1 after reaching end of string.
1547     virtual  int32_t   next(int32_t i) = 0;
1548
1549     virtual ~RBBIMonkeyKind();
1550     UErrorCode       deferredStatus;
1551
1552
1553 protected:
1554     RBBIMonkeyKind();
1555
1556 private:
1557 };
1558
1559 RBBIMonkeyKind::RBBIMonkeyKind() {
1560     deferredStatus = U_ZERO_ERROR;
1561 }
1562
1563 RBBIMonkeyKind::~RBBIMonkeyKind() {
1564 }
1565
1566
1567 //----------------------------------------------------------------------------------------
1568 //
1569 //   Random Numbers.  Similar to standard lib rand() and srand()
1570 //                    Not using library to
1571 //                      1.  Get same results on all platforms.
1572 //                      2.  Get access to current seed, to more easily reproduce failures.
1573 //
1574 //---------------------------------------------------------------------------------------
// Current state of the generator.  Kept at file scope so that it can be
// read and set when reproducing a failure.
static uint32_t m_seed = 1;

// Advance the linear congruential generator by one step and return a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;   // wraps modulo 2^32
    return (m_seed >> 16) & 0x7FFF;
}
1582
1583
1584 //------------------------------------------------------------------------------------------
1585 //
1586 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1587 //                             of RBBIMonkeyKind.
1588 //
1589 //------------------------------------------------------------------------------------------
1590 class RBBICharMonkey: public RBBIMonkeyKind {
1591 public:
1592     RBBICharMonkey();
1593     virtual          ~RBBICharMonkey();
1594     virtual  UVector *charClasses();
1595     virtual  void     setText(const UnicodeString &s);
1596     virtual  int32_t  next(int32_t i);
1597 private:
1598     UVector   *fSets;
1599
1600     UnicodeSet  *fCRLFSet;
1601     UnicodeSet  *fControlSet;
1602     UnicodeSet  *fExtendSet;
1603     UnicodeSet  *fZWJSet;
1604     UnicodeSet  *fRegionalIndicatorSet;
1605     UnicodeSet  *fPrependSet;
1606     UnicodeSet  *fSpacingSet;
1607     UnicodeSet  *fLSet;
1608     UnicodeSet  *fVSet;
1609     UnicodeSet  *fTSet;
1610     UnicodeSet  *fLVSet;
1611     UnicodeSet  *fLVTSet;
1612     UnicodeSet  *fHangulSet;
1613     UnicodeSet  *fExtendedPictSet;
1614     UnicodeSet  *fAnySet;
1615
1616     const UnicodeString *fText;
1617 };
1618
1619
1620 RBBICharMonkey::RBBICharMonkey() {
1621     UErrorCode  status = U_ZERO_ERROR;
1622
1623     fText = NULL;
1624
1625     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1626     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1627     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1628     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1629     fRegionalIndicatorSet =
1630                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1631     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1632     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1633     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1634     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1635     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1636     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1637     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1638     fHangulSet  = new UnicodeSet();
1639     fHangulSet->addAll(*fLSet);
1640     fHangulSet->addAll(*fVSet);
1641     fHangulSet->addAll(*fTSet);
1642     fHangulSet->addAll(*fLVSet);
1643     fHangulSet->addAll(*fLVTSet);
1644
1645     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1646     fAnySet           = new UnicodeSet(0, 0x10ffff);
1647
1648     fSets             = new UVector(status);
1649     fSets->addElement(fCRLFSet,    status);
1650     fSets->addElement(fControlSet, status);
1651     fSets->addElement(fExtendSet,  status);
1652     fSets->addElement(fRegionalIndicatorSet, status);
1653     if (!fPrependSet->isEmpty()) {
1654         fSets->addElement(fPrependSet, status);
1655     }
1656     fSets->addElement(fSpacingSet, status);
1657     fSets->addElement(fHangulSet,  status);
1658     fSets->addElement(fAnySet,     status);
1659     fSets->addElement(fZWJSet,     status);
1660     fSets->addElement(fExtendedPictSet, status);
1661     if (U_FAILURE(status)) {
1662         deferredStatus = status;
1663     }
1664 }
1665
1666
1667 void RBBICharMonkey::setText(const UnicodeString &s) {
1668     fText = &s;
1669 }
1670
1671
1672
1673 int32_t RBBICharMonkey::next(int32_t prevPos) {
1674     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1675                               //   break position being tested.  The candidate break
1676                               //   location is before p2.
1677
1678     int     breakPos = -1;
1679
1680     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1681     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1682
1683     if (U_FAILURE(deferredStatus)) {
1684         return -1;
1685     }
1686
1687     // Previous break at end of string.  return DONE.
1688     if (prevPos >= fText->length()) {
1689         return -1;
1690     }
1691     p0 = p1 = p2 = p3 = prevPos;
1692     c3 =  fText->char32At(prevPos);
1693     c0 = c1 = c2 = cBase = 0;
1694     (void)p0;   // suppress set but not used warning.
1695     (void)c0;
1696
1697     // Loop runs once per "significant" character position in the input text.
1698     for (;;) {
1699         // Move all of the positions forward in the input string.
1700         p0 = p1;  c0 = c1;
1701         p1 = p2;  c1 = c2;
1702         p2 = p3;  c2 = c3;
1703
1704         // Advancd p3 by one codepoint
1705         p3 = fText->moveIndex32(p3, 1);
1706         c3 = fText->char32At(p3);
1707
1708         if (p1 == p2) {
1709             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1710             continue;
1711         }
1712         if (p2 == fText->length()) {
1713             // Reached end of string.  Always a break position.
1714             break;
1715         }
1716
1717         // Rule  GB3   CR x LF
1718         //     No Extend or Format characters may appear between the CR and LF,
1719         //     which requires the additional check for p2 immediately following p1.
1720         //
1721         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1722             continue;
1723         }
1724
1725         // Rule (GB4).   ( Control | CR | LF ) <break>
1726         if (fControlSet->contains(c1) ||
1727             c1 == 0x0D ||
1728             c1 == 0x0A)  {
1729             break;
1730         }
1731
1732         // Rule (GB5)    <break>  ( Control | CR | LF )
1733         //
1734         if (fControlSet->contains(c2) ||
1735             c2 == 0x0D ||
1736             c2 == 0x0A)  {
1737             break;
1738         }
1739
1740
1741         // Rule (GB6)  L x ( L | V | LV | LVT )
1742         if (fLSet->contains(c1) &&
1743                (fLSet->contains(c2)  ||
1744                 fVSet->contains(c2)  ||
1745                 fLVSet->contains(c2) ||
1746                 fLVTSet->contains(c2))) {
1747             continue;
1748         }
1749
1750         // Rule (GB7)    ( LV | V )  x  ( V | T )
1751         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1752             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1753             continue;
1754         }
1755
1756         // Rule (GB8)    ( LVT | T)  x T
1757         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1758             fTSet->contains(c2))  {
1759             continue;
1760         }
1761
1762         // Rule (GB9)    x (Extend | ZWJ)
1763         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1764             if (!fExtendSet->contains(c1)) {
1765                 cBase = c1;
1766             }
1767             continue;
1768         }
1769
1770         // Rule (GB9a)   x  SpacingMark
1771         if (fSpacingSet->contains(c2)) {
1772             continue;
1773         }
1774
1775         // Rule (GB9b)   Prepend x
1776         if (fPrependSet->contains(c1)) {
1777             continue;
1778         }
1779
1780         // Rule (GB11)   Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1781         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1782             continue;
1783         }
1784
1785         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
1786         //                   Note: The first if condition is a little tricky. We only need to force
1787         //                      a break if there are three or more contiguous RIs. If there are
1788         //                      only two, a break following will occur via other rules, and will include
1789         //                      any trailing extend characters, which is needed behavior.
1790         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1791                 && fRegionalIndicatorSet->contains(c2)) {
1792             break;
1793         }
1794         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1795             continue;
1796         }
1797
1798         // Rule (GB999)  Any  <break>  Any
1799         break;
1800     }
1801
1802     breakPos = p2;
1803     return breakPos;
1804 }
1805
1806
1807
1808 UVector  *RBBICharMonkey::charClasses() {
1809     return fSets;
1810 }
1811
1812
1813 RBBICharMonkey::~RBBICharMonkey() {
1814     delete fSets;
1815     delete fCRLFSet;
1816     delete fControlSet;
1817     delete fExtendSet;
1818     delete fRegionalIndicatorSet;
1819     delete fPrependSet;
1820     delete fSpacingSet;
1821     delete fLSet;
1822     delete fVSet;
1823     delete fTSet;
1824     delete fLVSet;
1825     delete fLVTSet;
1826     delete fHangulSet;
1827     delete fAnySet;
1828     delete fZWJSet;
1829     delete fExtendedPictSet;
1830 }
1831
1832 //------------------------------------------------------------------------------------------
1833 //
1834 //   class RBBIWordMonkey      Word Break specific implementation
1835 //                             of RBBIMonkeyKind.
1836 //
1837 //------------------------------------------------------------------------------------------
1838 class RBBIWordMonkey: public RBBIMonkeyKind {
1839 public:
1840     RBBIWordMonkey();
1841     virtual          ~RBBIWordMonkey();
1842     virtual  UVector *charClasses();
1843     virtual  void     setText(const UnicodeString &s);
1844     virtual int32_t   next(int32_t i);
1845 private:
1846     UVector      *fSets;
1847
1848     UnicodeSet  *fCRSet;
1849     UnicodeSet  *fLFSet;
1850     UnicodeSet  *fNewlineSet;
1851     UnicodeSet  *fRegionalIndicatorSet;
1852     UnicodeSet  *fKatakanaSet;
1853     UnicodeSet  *fHebrew_LetterSet;
1854     UnicodeSet  *fALetterSet;
1855     UnicodeSet  *fSingle_QuoteSet;
1856     UnicodeSet  *fDouble_QuoteSet;
1857     UnicodeSet  *fMidNumLetSet;
1858     UnicodeSet  *fMidLetterSet;
1859     UnicodeSet  *fMidNumSet;
1860     UnicodeSet  *fNumericSet;
1861     UnicodeSet  *fFormatSet;
1862     UnicodeSet  *fOtherSet;
1863     UnicodeSet  *fExtendSet;
1864     UnicodeSet  *fExtendNumLetSet;
1865     UnicodeSet  *fWSegSpaceSet;
1866     UnicodeSet  *fDictionarySet;
1867     UnicodeSet  *fZWJSet;
1868     UnicodeSet  *fExtendedPictSet;
1869
1870     const UnicodeString  *fText;
1871 };
1872
1873
1874 RBBIWordMonkey::RBBIWordMonkey()
1875 {
1876     UErrorCode  status = U_ZERO_ERROR;
1877
1878     fSets            = new UVector(status);
1879
1880     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1881     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1882     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1883     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1884     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1885     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1886     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1887     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1888     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1889     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1890     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]",    status);
1891     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1892     fNumericSet       = new UnicodeSet(u"[[\\p{Word_Break = Numeric}][\\uff10-\\uff19]]", status);
1893     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1894     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1895     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
1896     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1897
1898     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1899     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1900
1901     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1902     fDictionarySet->addAll(*fKatakanaSet);
1903     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1904
1905     fALetterSet->removeAll(*fDictionarySet);
1906
1907     fOtherSet        = new UnicodeSet();
1908     if(U_FAILURE(status)) {
1909         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1910         deferredStatus = status;
1911         return;
1912     }
1913
1914     fOtherSet->complement();
1915     fOtherSet->removeAll(*fCRSet);
1916     fOtherSet->removeAll(*fLFSet);
1917     fOtherSet->removeAll(*fNewlineSet);
1918     fOtherSet->removeAll(*fKatakanaSet);
1919     fOtherSet->removeAll(*fHebrew_LetterSet);
1920     fOtherSet->removeAll(*fALetterSet);
1921     fOtherSet->removeAll(*fSingle_QuoteSet);
1922     fOtherSet->removeAll(*fDouble_QuoteSet);
1923     fOtherSet->removeAll(*fMidLetterSet);
1924     fOtherSet->removeAll(*fMidNumSet);
1925     fOtherSet->removeAll(*fNumericSet);
1926     fOtherSet->removeAll(*fExtendNumLetSet);
1927     fOtherSet->removeAll(*fWSegSpaceSet);
1928     fOtherSet->removeAll(*fFormatSet);
1929     fOtherSet->removeAll(*fExtendSet);
1930     fOtherSet->removeAll(*fRegionalIndicatorSet);
1931     fOtherSet->removeAll(*fZWJSet);
1932     fOtherSet->removeAll(*fExtendedPictSet);
1933
1934     // Inhibit dictionary characters from being tested at all.
1935     fOtherSet->removeAll(*fDictionarySet);
1936
1937     fSets->addElement(fCRSet,                status);
1938     fSets->addElement(fLFSet,                status);
1939     fSets->addElement(fNewlineSet,           status);
1940     fSets->addElement(fRegionalIndicatorSet, status);
1941     fSets->addElement(fHebrew_LetterSet,     status);
1942     fSets->addElement(fALetterSet,           status);
1943     fSets->addElement(fSingle_QuoteSet,      status);
1944     fSets->addElement(fDouble_QuoteSet,      status);
1945     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
1946                                                         // from the test data. They are all in the dictionary set,
1947                                                         // which this (old, to be retired) monkey test cannot handle.
1948     fSets->addElement(fMidLetterSet,         status);
1949     fSets->addElement(fMidNumLetSet,         status);
1950     fSets->addElement(fMidNumSet,            status);
1951     fSets->addElement(fNumericSet,           status);
1952     fSets->addElement(fFormatSet,            status);
1953     fSets->addElement(fExtendSet,            status);
1954     fSets->addElement(fOtherSet,             status);
1955     fSets->addElement(fExtendNumLetSet,      status);
1956     fSets->addElement(fWSegSpaceSet,         status);
1957
1958     fSets->addElement(fZWJSet,               status);
1959     fSets->addElement(fExtendedPictSet,      status);
1960
1961     if (U_FAILURE(status)) {
1962         deferredStatus = status;
1963     }
1964 }
1965
1966 void RBBIWordMonkey::setText(const UnicodeString &s) {
1967     fText       = &s;
1968 }
1969
1970
1971 int32_t RBBIWordMonkey::next(int32_t prevPos) {
1972     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1973                               //   break position being tested.  The candidate break
1974                               //   location is before p2.
1975
1976     int     breakPos = -1;
1977
1978     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1979
1980     if (U_FAILURE(deferredStatus)) {
1981         return -1;
1982     }
1983
1984     // Prev break at end of string.  return DONE.
1985     if (prevPos >= fText->length()) {
1986         return -1;
1987     }
1988     p0 = p1 = p2 = p3 = prevPos;
1989     c3 =  fText->char32At(prevPos);
1990     c0 = c1 = c2 = 0;
1991     (void)p0;       // Suppress set but not used warning.
1992
1993     // Loop runs once per "significant" character position in the input text.
1994     for (;;) {
1995         // Move all of the positions forward in the input string.
1996         p0 = p1;  c0 = c1;
1997         p1 = p2;  c1 = c2;
1998         p2 = p3;  c2 = c3;
1999
2000         // Advancd p3 by    X(Extend | Format)*   Rule 4
2001         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2002         do {
2003             p3 = fText->moveIndex32(p3, 1);
2004             c3 = fText->char32At(p3);
2005             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2006                break;
2007             };
2008         }
2009         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2010
2011
2012         if (p1 == p2) {
2013             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2014             continue;
2015         }
2016         if (p2 == fText->length()) {
2017             // Reached end of string.  Always a break position.
2018             break;
2019         }
2020
2021         // Rule  (3)   CR x LF
2022         //     No Extend or Format characters may appear between the CR and LF,
2023         //     which requires the additional check for p2 immediately following p1.
2024         //
2025         if (c1==0x0D && c2==0x0A) {
2026             continue;
2027         }
2028
2029         // Rule (3a)  Break before and after newlines (including CR and LF)
2030         //
2031         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2032             break;
2033         };
2034         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2035             break;
2036         };
2037
2038         // Rule (3c)    ZWJ x Extended_Pictographic
2039         //              Not ignoring extend chars, so peek into input text to
2040         //              get the potential ZWJ, the character immediately preceding c2.
2041         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2042         //              but char32At will get the full code point.
2043         if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
2044             continue;
2045         }
2046
2047         // Rule (3d)    Keep horizontal whitespace together.
2048         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2049             continue;
2050         }
2051
2052         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2053         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2054             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2055             continue;
2056         }
2057
2058         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2059         //
2060         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2061              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2062              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2063             continue;
2064         }
2065
2066         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2067         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2068             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2069             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2070             continue;
2071         }
2072
2073         // Rule (7a)     Hebrew_Letter x Single_Quote
2074         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2075             continue;
2076         }
2077
2078         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2079         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2080             continue;
2081         }
2082
2083         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2084         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2085             continue;
2086         }
2087
2088         // Rule (8)    Numeric x Numeric
2089         if (fNumericSet->contains(c1) &&
2090             fNumericSet->contains(c2))  {
2091             continue;
2092         }
2093
2094         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2095         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2096             fNumericSet->contains(c2))  {
2097             continue;
2098         }
2099
2100         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2101         if (fNumericSet->contains(c1) &&
2102             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2103             continue;
2104         }
2105
2106         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2107         if (fNumericSet->contains(c0) &&
2108             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2109             fNumericSet->contains(c2)) {
2110             continue;
2111         }
2112
2113         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2114         if (fNumericSet->contains(c1) &&
2115             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2116             fNumericSet->contains(c3)) {
2117             continue;
2118         }
2119
2120         // Rule (13)  Katakana x Katakana
2121         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2122         //                  all Katakana are handled by the dictionary breaker.
2123         if (fKatakanaSet->contains(c1) &&
2124             fKatakanaSet->contains(c2))  {
2125             continue;
2126         }
2127
2128         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2129         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2130              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2131              fExtendNumLetSet->contains(c2)) {
2132                 continue;
2133         }
2134
2135         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2136         if (fExtendNumLetSet->contains(c1) &&
2137                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2138                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2139             continue;
2140         }
2141
2142         // Rule 15 - 17   Group pairs of Regional Indicators.
2143         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2144             break;
2145         }
2146         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2147             continue;
2148         }
2149
2150         // Rule 999.  Break found here.
2151         break;
2152     }
2153
2154     breakPos = p2;
2155     return breakPos;
2156 }
2157
2158
2159 UVector  *RBBIWordMonkey::charClasses() {
2160     return fSets;
2161 }
2162
2163
2164 RBBIWordMonkey::~RBBIWordMonkey() {
2165     delete fSets;
2166     delete fCRSet;
2167     delete fLFSet;
2168     delete fNewlineSet;
2169     delete fKatakanaSet;
2170     delete fHebrew_LetterSet;
2171     delete fALetterSet;
2172     delete fSingle_QuoteSet;
2173     delete fDouble_QuoteSet;
2174     delete fMidNumLetSet;
2175     delete fMidLetterSet;
2176     delete fMidNumSet;
2177     delete fNumericSet;
2178     delete fFormatSet;
2179     delete fExtendSet;
2180     delete fExtendNumLetSet;
2181     delete fWSegSpaceSet;
2182     delete fRegionalIndicatorSet;
2183     delete fDictionarySet;
2184     delete fOtherSet;
2185     delete fZWJSet;
2186     delete fExtendedPictSet;
2187 }
2188
2189
2190
2191
2192 //------------------------------------------------------------------------------------------
2193 //
2194 //   class RBBISentMonkey      Sentence Break specific implementation
2195 //                             of RBBIMonkeyKind.
2196 //
2197 //------------------------------------------------------------------------------------------
2198 class RBBISentMonkey: public RBBIMonkeyKind {
2199 public:
2200     RBBISentMonkey();
2201     virtual          ~RBBISentMonkey();
2202     virtual  UVector *charClasses();
2203     virtual  void     setText(const UnicodeString &s);
2204     virtual int32_t   next(int32_t i);
2205 private:
2206     int               moveBack(int posFrom);
2207     int               moveForward(int posFrom);
2208     UChar32           cAt(int pos);
2209
2210     UVector      *fSets;
2211
2212     UnicodeSet  *fSepSet;
2213     UnicodeSet  *fFormatSet;
2214     UnicodeSet  *fSpSet;
2215     UnicodeSet  *fLowerSet;
2216     UnicodeSet  *fUpperSet;
2217     UnicodeSet  *fOLetterSet;
2218     UnicodeSet  *fNumericSet;
2219     UnicodeSet  *fATermSet;
2220     UnicodeSet  *fSContinueSet;
2221     UnicodeSet  *fSTermSet;
2222     UnicodeSet  *fCloseSet;
2223     UnicodeSet  *fOtherSet;
2224     UnicodeSet  *fExtendSet;
2225
2226     const UnicodeString  *fText;
2227
2228 };
2229
2230 RBBISentMonkey::RBBISentMonkey()
2231 {
2232     UErrorCode  status = U_ZERO_ERROR;
2233
2234     fSets            = new UVector(status);
2235
2236     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2237     //                       set and made into character classes of their own.  For the monkey impl,
2238     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2239     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2240     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2241     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2242     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2243     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2244     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2245     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2246     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2247     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2248     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2249     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2250     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2251     fOtherSet        = new UnicodeSet();
2252
2253     if(U_FAILURE(status)) {
2254       deferredStatus = status;
2255       return;
2256     }
2257
2258     fOtherSet->complement();
2259     fOtherSet->removeAll(*fSepSet);
2260     fOtherSet->removeAll(*fFormatSet);
2261     fOtherSet->removeAll(*fSpSet);
2262     fOtherSet->removeAll(*fLowerSet);
2263     fOtherSet->removeAll(*fUpperSet);
2264     fOtherSet->removeAll(*fOLetterSet);
2265     fOtherSet->removeAll(*fNumericSet);
2266     fOtherSet->removeAll(*fATermSet);
2267     fOtherSet->removeAll(*fSContinueSet);
2268     fOtherSet->removeAll(*fSTermSet);
2269     fOtherSet->removeAll(*fCloseSet);
2270     fOtherSet->removeAll(*fExtendSet);
2271
2272     fSets->addElement(fSepSet,       status);
2273     fSets->addElement(fFormatSet,    status);
2274     fSets->addElement(fSpSet,        status);
2275     fSets->addElement(fLowerSet,     status);
2276     fSets->addElement(fUpperSet,     status);
2277     fSets->addElement(fOLetterSet,   status);
2278     fSets->addElement(fNumericSet,   status);
2279     fSets->addElement(fATermSet,     status);
2280     fSets->addElement(fSContinueSet, status);
2281     fSets->addElement(fSTermSet,     status);
2282     fSets->addElement(fCloseSet,     status);
2283     fSets->addElement(fOtherSet,     status);
2284     fSets->addElement(fExtendSet,    status);
2285
2286     if (U_FAILURE(status)) {
2287         deferredStatus = status;
2288     }
2289 }
2290
2291
2292
2293 void RBBISentMonkey::setText(const UnicodeString &s) {
2294     fText       = &s;
2295 }
2296
2297 UVector  *RBBISentMonkey::charClasses() {
2298     return fSets;
2299 }
2300
2301
2302 //  moveBack()   Find the "significant" code point preceding the index i.
2303 //               Skips over ($Extend | $Format)* .
2304 //
2305 int RBBISentMonkey::moveBack(int i) {
2306     if (i <= 0) {
2307         return -1;
2308     }
2309     UChar32   c;
2310     int32_t   j = i;
2311     do {
2312         j = fText->moveIndex32(j, -1);
2313         c = fText->char32At(j);
2314     }
2315     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2316     return j;
2317
2318  }
2319
2320
2321 int RBBISentMonkey::moveForward(int i) {
2322     if (i>=fText->length()) {
2323         return fText->length();
2324     }
2325     UChar32   c;
2326     int32_t   j = i;
2327     do {
2328         j = fText->moveIndex32(j, 1);
2329         c = cAt(j);
2330     }
2331     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2332     return j;
2333 }
2334
2335 UChar32 RBBISentMonkey::cAt(int pos) {
2336     if (pos<0 || pos>=fText->length()) {
2337         return -1;
2338     } else {
2339         return fText->char32At(pos);
2340     }
2341 }
2342
2343 int32_t RBBISentMonkey::next(int32_t prevPos) {
2344     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2345                               //   break position being tested.  The candidate break
2346                               //   location is before p2.
2347
2348     int     breakPos = -1;
2349
2350     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2351     UChar32 c;
2352
2353     if (U_FAILURE(deferredStatus)) {
2354         return -1;
2355     }
2356
2357     // Prev break at end of string.  return DONE.
2358     if (prevPos >= fText->length()) {
2359         return -1;
2360     }
2361     p0 = p1 = p2 = p3 = prevPos;
2362     c3 =  fText->char32At(prevPos);
2363     c0 = c1 = c2 = 0;
2364     (void)p0;     // Suppress set but not used warning.
2365
2366     // Loop runs once per "significant" character position in the input text.
2367     for (;;) {
2368         // Move all of the positions forward in the input string.
2369         p0 = p1;  c0 = c1;
2370         p1 = p2;  c1 = c2;
2371         p2 = p3;  c2 = c3;
2372
2373         // Advancd p3 by    X(Extend | Format)*   Rule 4
2374         p3 = moveForward(p3);
2375         c3 = cAt(p3);
2376
2377         // Rule (3)  CR x LF
2378         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2379             continue;
2380         }
2381
2382         // Rule (4).   Sep  <break>
2383         if (fSepSet->contains(c1)) {
2384             p2 = p1+1;   // Separators don't combine with Extend or Format.
2385             break;
2386         }
2387
2388         if (p2 >= fText->length()) {
2389             // Reached end of string.  Always a break position.
2390             break;
2391         }
2392
2393         if (p2 == prevPos) {
2394             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2395             continue;
2396         }
2397
2398         // Rule (6).   ATerm x Numeric
2399         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2400             continue;
2401         }
2402
2403         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2404         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2405                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2406             continue;
2407         }
2408
2409         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2410         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2411         //                  note to the Unicode 5.0 documents.
2412         int p8 = p1;
2413         while (fSpSet->contains(cAt(p8))) {
2414             p8 = moveBack(p8);
2415         }
2416         while (fCloseSet->contains(cAt(p8))) {
2417             p8 = moveBack(p8);
2418         }
2419         if (fATermSet->contains(cAt(p8))) {
2420             p8=p2;
2421             for (;;) {
2422                 c = cAt(p8);
2423                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2424                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2425                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2426                     break;
2427                 }
2428                 p8 = moveForward(p8);
2429             }
2430             if (fLowerSet->contains(cAt(p8))) {
2431                 continue;
2432             }
2433         }
2434
2435         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2436         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2437             p8 = p1;
2438             while (fSpSet->contains(cAt(p8))) {
2439                 p8 = moveBack(p8);
2440             }
2441             while (fCloseSet->contains(cAt(p8))) {
2442                 p8 = moveBack(p8);
2443             }
2444             c = cAt(p8);
2445             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2446                 continue;
2447             }
2448         }
2449
2450         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2451         int p9 = p1;
2452         while (fCloseSet->contains(cAt(p9))) {
2453             p9 = moveBack(p9);
2454         }
2455         c = cAt(p9);
2456         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2457             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2458                 continue;
2459             }
2460         }
2461
2462         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2463         int p10 = p1;
2464         while (fSpSet->contains(cAt(p10))) {
2465             p10 = moveBack(p10);
2466         }
2467         while (fCloseSet->contains(cAt(p10))) {
2468             p10 = moveBack(p10);
2469         }
2470         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2471             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2472                 continue;
2473             }
2474         }
2475
2476         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2477         int p11 = p1;
2478         if (fSepSet->contains(cAt(p11))) {
2479             p11 = moveBack(p11);
2480         }
2481         while (fSpSet->contains(cAt(p11))) {
2482             p11 = moveBack(p11);
2483         }
2484         while (fCloseSet->contains(cAt(p11))) {
2485             p11 = moveBack(p11);
2486         }
2487         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2488             break;
2489         }
2490
2491         //  Rule (12)  Any x Any
2492         continue;
2493     }
2494     breakPos = p2;
2495     return breakPos;
2496 }
2497
2498 RBBISentMonkey::~RBBISentMonkey() {
2499     delete fSets;
2500     delete fSepSet;
2501     delete fFormatSet;
2502     delete fSpSet;
2503     delete fLowerSet;
2504     delete fUpperSet;
2505     delete fOLetterSet;
2506     delete fNumericSet;
2507     delete fATermSet;
2508     delete fSContinueSet;
2509     delete fSTermSet;
2510     delete fCloseSet;
2511     delete fOtherSet;
2512     delete fExtendSet;
2513 }
2514
2515
2516
2517 //-------------------------------------------------------------------------------------------
2518 //
2519 //  RBBILineMonkey
2520 //
2521 //-------------------------------------------------------------------------------------------
2522
2523 class RBBILineMonkey: public RBBIMonkeyKind {
2524 public:
2525     RBBILineMonkey();
2526     virtual          ~RBBILineMonkey();
2527     virtual  UVector *charClasses();
2528     virtual  void     setText(const UnicodeString &s);
2529     virtual  int32_t  next(int32_t i);
2530     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2531 private:
2532     UVector      *fSets;
2533
2534     UnicodeSet  *fBK;
2535     UnicodeSet  *fCR;
2536     UnicodeSet  *fLF;
2537     UnicodeSet  *fCM;
2538     UnicodeSet  *fNL;
2539     UnicodeSet  *fSG;
2540     UnicodeSet  *fWJ;
2541     UnicodeSet  *fZW;
2542     UnicodeSet  *fGL;
2543     UnicodeSet  *fCB;
2544     UnicodeSet  *fSP;
2545     UnicodeSet  *fB2;
2546     UnicodeSet  *fBA;
2547     UnicodeSet  *fBB;
2548     UnicodeSet  *fHH;
2549     UnicodeSet  *fHY;
2550     UnicodeSet  *fH2;
2551     UnicodeSet  *fH3;
2552     UnicodeSet  *fCL;
2553     UnicodeSet  *fCP;
2554     UnicodeSet  *fEX;
2555     UnicodeSet  *fIN;
2556     UnicodeSet  *fJL;
2557     UnicodeSet  *fJV;
2558     UnicodeSet  *fJT;
2559     UnicodeSet  *fNS;
2560     UnicodeSet  *fOP;
2561     UnicodeSet  *fQU;
2562     UnicodeSet  *fIS;
2563     UnicodeSet  *fNU;
2564     UnicodeSet  *fPO;
2565     UnicodeSet  *fPR;
2566     UnicodeSet  *fSY;
2567     UnicodeSet  *fAI;
2568     UnicodeSet  *fAL;
2569     UnicodeSet  *fCJ;
2570     UnicodeSet  *fHL;
2571     UnicodeSet  *fID;
2572     UnicodeSet  *fRI;
2573     UnicodeSet  *fXX;
2574     UnicodeSet  *fEB;
2575     UnicodeSet  *fEM;
2576     UnicodeSet  *fZWJ;
2577
2578     BreakIterator        *fCharBI;
2579     const UnicodeString  *fText;
2580     RegexMatcher         *fNumberMatcher;
2581 };
2582
2583 RBBILineMonkey::RBBILineMonkey() :
2584     RBBIMonkeyKind(),
2585     fSets(NULL),
2586
2587     fCharBI(NULL),
2588     fText(NULL),
2589     fNumberMatcher(NULL)
2590
2591 {
2592     if (U_FAILURE(deferredStatus)) {
2593         return;
2594     }
2595
2596     UErrorCode  status = U_ZERO_ERROR;
2597
2598     fSets  = new UVector(status);
2599
2600     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2601     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2602     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2603     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2604     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2605     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2606     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2607     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2608     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2609     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2610     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2611     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2612     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2613     fHH    = new UnicodeSet();
2614     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2615     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2616     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2617     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2618     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2619     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2620     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2621     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2622     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2623     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2624     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2625     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2626     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2627     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2628     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2629     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2630     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2631     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2632     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2633     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2634     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2635     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2636     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2637     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2638     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2639     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2640     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2641     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2642     fZWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2643
2644     if (U_FAILURE(status)) {
2645         deferredStatus = status;
2646         return;
2647     }
2648
2649     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2650     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2651     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2652
2653     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2654     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2655
2656     fHH->add(u'\u2010');   // Hyphen, '‐'
2657
2658     fSets->addElement(fBK, status);
2659     fSets->addElement(fCR, status);
2660     fSets->addElement(fLF, status);
2661     fSets->addElement(fCM, status);
2662     fSets->addElement(fNL, status);
2663     fSets->addElement(fWJ, status);
2664     fSets->addElement(fZW, status);
2665     fSets->addElement(fGL, status);
2666     fSets->addElement(fCB, status);
2667     fSets->addElement(fSP, status);
2668     fSets->addElement(fB2, status);
2669     fSets->addElement(fBA, status);
2670     fSets->addElement(fBB, status);
2671     fSets->addElement(fHY, status);
2672     fSets->addElement(fH2, status);
2673     fSets->addElement(fH3, status);
2674     fSets->addElement(fCL, status);
2675     fSets->addElement(fCP, status);
2676     fSets->addElement(fEX, status);
2677     fSets->addElement(fIN, status);
2678     fSets->addElement(fJL, status);
2679     fSets->addElement(fJT, status);
2680     fSets->addElement(fJV, status);
2681     fSets->addElement(fNS, status);
2682     fSets->addElement(fOP, status);
2683     fSets->addElement(fQU, status);
2684     fSets->addElement(fIS, status);
2685     fSets->addElement(fNU, status);
2686     fSets->addElement(fPO, status);
2687     fSets->addElement(fPR, status);
2688     fSets->addElement(fSY, status);
2689     fSets->addElement(fAI, status);
2690     fSets->addElement(fAL, status);
2691     fSets->addElement(fHL, status);
2692     fSets->addElement(fID, status);
2693     fSets->addElement(fWJ, status);
2694     fSets->addElement(fRI, status);
2695     fSets->addElement(fSG, status);
2696     fSets->addElement(fEB, status);
2697     fSets->addElement(fEM, status);
2698     fSets->addElement(fZWJ, status);
2699
2700
2701     const char *rules =
2702             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2703             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2704             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2705             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2706             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2707             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2708             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2709
2710     fNumberMatcher = new RegexMatcher(
2711         UnicodeString(rules, -1, US_INV), 0, status);
2712
2713     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2714
2715     if (U_FAILURE(status)) {
2716         deferredStatus = status;
2717     }
2718 }
2719
2720
2721 void RBBILineMonkey::setText(const UnicodeString &s) {
2722     fText       = &s;
2723     fCharBI->setText(s);
2724     fNumberMatcher->reset(s);
2725 }
2726
2727 //
2728 //  rule9Adjust
2729 //     Line Break TR rules 9 and 10 implementation.
2730 //     This deals with combining marks and other sequences that
2731 //     that must be treated as if they were something other than what they actually are.
2732 //
2733 //     This is factored out into a separate function because it must be applied twice for
2734 //     each potential break, once to the chars before the position being checked, then
2735 //     again to the text following the possible break.
2736 //
2737 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2738     if (pos == -1) {
2739         // Invalid initial position.  Happens during the warmup iteration of the
2740         //   main loop in next().
2741         return;
2742     }
2743
2744     int32_t  nPos = *nextPos;
2745
2746     // LB 9  Keep combining sequences together.
2747     //  advance over any CM class chars.  Note that Line Break CM is different
2748     //  from the normal Grapheme Extend property.
2749     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2750           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2751         for (;;) {
2752             *nextChar = fText->char32At(nPos);
2753             if (!fCM->contains(*nextChar)) {
2754                 break;
2755             }
2756             nPos = fText->moveIndex32(nPos, 1);
2757         }
2758     }
2759
2760
2761     // LB 9 Treat X CM* as if it were x.
2762     //       No explicit action required.
2763
2764     // LB 10  Treat any remaining combining mark as AL
2765     if (fCM->contains(*posChar)) {
2766         *posChar = u'A';
2767     }
2768
2769     // Push the updated nextPos and nextChar back to our caller.
2770     // This only makes a difference if posChar got bigger by consuming a
2771     // combining sequence.
2772     *nextPos  = nPos;
2773     *nextChar = fText->char32At(nPos);
2774 }
2775
2776
2777
2778 int32_t RBBILineMonkey::next(int32_t startPos) {
2779     UErrorCode status = U_ZERO_ERROR;
2780     int32_t    pos;       //  Index of the char following a potential break position
2781     UChar32    thisChar;  //  Character at above position "pos"
2782
2783     int32_t    prevPos;   //  Index of the char preceding a potential break position
2784     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2785                           //   and thisChar may not be adjacent because combining
2786                           //   characters between them will be ignored.
2787
2788     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2789     UChar32    prevCharX2;
2790
2791     int32_t    nextPos;   //  Index of the next character following pos.
2792                           //     Usually skips over combining marks.
2793     int32_t    nextCPPos; //  Index of the code point following "pos."
2794                           //     May point to a combining mark.
2795     int32_t    tPos;      //  temp value.
2796     UChar32    c;
2797
2798     if (U_FAILURE(deferredStatus)) {
2799         return -1;
2800     }
2801
2802     if (startPos >= fText->length()) {
2803         return -1;
2804     }
2805
2806
2807     // Initial values for loop.  Loop will run the first time without finding breaks,
2808     //                           while the invalid values shift out and the "this" and
2809     //                           "prev" positions are filled in with good values.
2810     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2811     thisChar = prevChar  = prevCharX2 = 0;
2812     nextPos  = nextCPPos = startPos;
2813
2814
2815     // Loop runs once per position in the test text, until a break position
2816     //  is found.
2817     for (;;) {
2818         prevPosX2 = prevPos;
2819         prevCharX2 = prevChar;
2820
2821         prevPos   = pos;
2822         prevChar  = thisChar;
2823
2824         pos       = nextPos;
2825         thisChar  = fText->char32At(pos);
2826
2827         nextCPPos = fText->moveIndex32(pos, 1);
2828         nextPos   = nextCPPos;
2829
2830         // Rule LB2 - Break at end of text.
2831         if (pos >= fText->length()) {
2832             break;
2833         }
2834
2835         // Rule LB 9 - adjust for combining sequences.
2836         //             We do this one out-of-order because the adjustment does not change anything
2837         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2838         //             be applied.
2839         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
2840         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2841         c = fText->char32At(nextPos);
2842         rule9Adjust(pos,     &thisChar, &nextPos, &c);
2843
2844         // If the loop is still warming up - if we haven't shifted the initial
2845         //   -1 positions out of prevPos yet - loop back to advance the
2846         //    position in the input without any further looking for breaks.
2847         if (prevPos == -1) {
2848             continue;
2849         }
2850
2851         // LB 4  Always break after hard line breaks,
2852         if (fBK->contains(prevChar)) {
2853             break;
2854         }
2855
2856         // LB 5  Break after CR, LF, NL, but not inside CR LF
2857         if (prevChar == 0x0d && thisChar == 0x0a) {
2858             continue;
2859         }
2860         if (prevChar == 0x0d ||
2861             prevChar == 0x0a ||
2862             prevChar == 0x85)  {
2863             break;
2864         }
2865
2866         // LB 6  Don't break before hard line breaks
2867         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2868             fBK->contains(thisChar)) {
2869                 continue;
2870         }
2871
2872
2873         // LB 7  Don't break before spaces or zero-width space.
2874         if (fSP->contains(thisChar)) {
2875             continue;
2876         }
2877
2878         if (fZW->contains(thisChar)) {
2879             continue;
2880         }
2881
2882         // LB 8  Break after zero width space
2883         //       ZW SP* ÷
2884         //       Scan backwards from prevChar for SP* ZW
2885         tPos = prevPos;
2886         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2887             tPos = fText->moveIndex32(tPos, -1);
2888         }
2889         if (fZW->contains(fText->char32At(tPos))) {
2890             break;
2891         }
2892
2893         // LB 25    Numbers
2894         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2895         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2896         if (fNumberMatcher->lookingAt(prevPos, status)) {
2897             if (U_FAILURE(status)) {
2898                 break;
2899             }
2900             // Matched a number.  But could have been just a single digit, which would
2901             //    not represent a "no break here" between prevChar and thisChar
2902             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2903             if (numEndIdx > pos) {
2904                 // Number match includes at least our two chars being checked
2905                 if (numEndIdx > nextPos) {
2906                     // Number match includes additional chars.  Update pos and nextPos
2907                     //   so that next loop iteration will continue at the end of the number,
2908                     //   checking for breaks between last char in number & whatever follows.
2909                     pos = nextPos = numEndIdx;
2910                     do {
2911                         pos = fText->moveIndex32(pos, -1);
2912                         thisChar = fText->char32At(pos);
2913                     } while (fCM->contains(thisChar));
2914                 }
2915                 continue;
2916             }
2917         }
2918
2919         // LB 8a ZWJ x
2920         //       The monkey test's way of ignoring combining characters doesn't work
2921         //       for this rule. ZJ is also a CM. Need to get the actual character
2922         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
2923         {
2924             int32_t prevIdx = fText->moveIndex32(pos, -1);
2925             UChar32 prevC = fText->char32At(prevIdx);
2926             if (fZWJ->contains(prevC)) {
2927                 continue;
2928             }
2929         }
2930
2931         // LB 9, 10  Already done, at top of loop.
2932         //
2933
2934
2935         // LB 11  Do not break before or after WORD JOINER and related characters.
2936         //    x  WJ
2937         //    WJ  x
2938         //
2939         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2940             continue;
2941         }
2942
2943         // LB 12
2944         //    GL  x
2945         if (fGL->contains(prevChar)) {
2946             continue;
2947         }
2948
2949         // LB 12a
2950         //    [^SP BA HY] x GL
2951         if (!(fSP->contains(prevChar) ||
2952               fBA->contains(prevChar) ||
2953               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
2954             continue;
2955         }
2956
2957         // LB 13  Don't break before closings.
2958         //
2959         if (fCL->contains(thisChar) ||
2960                 fCP->contains(thisChar) ||
2961                 fEX->contains(thisChar) ||
2962                 fSY->contains(thisChar)) {
2963             continue;
2964         }
2965
2966         // LB 14 Don't break after OP SP*
2967         //       Scan backwards, checking for this sequence.
2968         //       The OP char could include combining marks, so we actually check for
2969         //           OP CM* SP*
2970         //       Another Twist: The Rule 9 fixes may have changed a SP CM
2971         //       sequence into a ID char, so before scanning back through spaces,
2972         //       verify that prevChar is indeed a space.  The prevChar variable
2973         //       may differ from fText[prevPos]
2974         tPos = prevPos;
2975         if (fSP->contains(prevChar)) {
2976             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2977                 tPos=fText->moveIndex32(tPos, -1);
2978             }
2979         }
2980         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2981             tPos=fText->moveIndex32(tPos, -1);
2982         }
2983         if (fOP->contains(fText->char32At(tPos))) {
2984             continue;
2985         }
2986
2987
2988         // LB 14a Break before an IS that begins a number and follows a space
2989         if (nextPos < fText->length()) {
2990             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
2991             //       from a legit ffff character. So test length separately.
2992             UChar32 nextChar = fText->char32At(nextPos);
2993             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
2994                 break;
2995             }
2996         }
2997
2998         // LB14b Do not break before numeric separators, even after spaces.
2999         if (fIS->contains(thisChar)) {
3000             continue;
3001         }
3002
3003         // LB 15    QU SP* x OP
3004         if (fOP->contains(thisChar)) {
3005             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3006             int tPos = prevPos;
3007             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3008                 tPos = fText->moveIndex32(tPos, -1);
3009             }
3010             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3011                 tPos = fText->moveIndex32(tPos, -1);
3012             }
3013             if (fQU->contains(fText->char32At(tPos))) {
3014                 continue;
3015             }
3016         }
3017
3018
3019
3020         // LB 16   (CL | CP) SP* x NS
3021         //    Scan backwards for SP* CM* (CL | CP)
3022         if (fNS->contains(thisChar)) {
3023             int tPos = prevPos;
3024             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3025                 tPos = fText->moveIndex32(tPos, -1);
3026             }
3027             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3028                 tPos = fText->moveIndex32(tPos, -1);
3029             }
3030             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3031                 continue;
3032             }
3033         }
3034
3035
3036         // LB 17        B2 SP* x B2
3037         if (fB2->contains(thisChar)) {
3038             //  Scan backwards, checking for the B2 CM* SP* sequence.
3039             tPos = prevPos;
3040             if (fSP->contains(prevChar)) {
3041                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3042                     tPos=fText->moveIndex32(tPos, -1);
3043                 }
3044             }
3045             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3046                 tPos=fText->moveIndex32(tPos, -1);
3047             }
3048             if (fB2->contains(fText->char32At(tPos))) {
3049                 continue;
3050             }
3051         }
3052
3053
3054         // LB 18    break after space
3055         if (fSP->contains(prevChar)) {
3056             break;
3057         }
3058
3059         // LB 19
3060         //    x   QU
3061         //    QU  x
3062         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3063             continue;
3064         }
3065
3066         // LB 20  Break around a CB
3067         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3068             break;
3069         }
3070
3071         // LB 20.09  Don't break between Hyphens and letters if a break precedes the hyphen.
3072         //           Formerly this was a Finnish tailoring.
3073         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3074         //    ^($HY | $HH) $AL;
3075         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3076                 prevPosX2 == -1) {
3077             continue;
3078         }
3079
3080         // LB 21
3081         if (fBA->contains(thisChar) ||
3082             fHY->contains(thisChar) ||
3083             fNS->contains(thisChar) ||
3084             fBB->contains(prevChar) )   {
3085             continue;
3086         }
3087
3088         // LB 21a
3089         //   HL (HY | BA) x
3090         if (fHL->contains(prevCharX2) &&
3091                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3092             continue;
3093         }
3094
3095         // LB 21b
3096         //   SY x HL
3097         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3098             continue;
3099         }
3100
3101         // LB 22
3102         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3103             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3104             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3105             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3106             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3107             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3108             continue;
3109         }
3110
3111
3112         // LB 23    (AL | HL) x NU
3113         //          NU x (AL | HL)
3114         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3115             continue;
3116         }
3117         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3118             continue;
3119         }
3120
3121         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3122         //      PR x (ID | EB | EM)
3123         //     (ID | EB | EM) x PO
3124         if (fPR->contains(prevChar) &&
3125                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3126             continue;
3127         }
3128         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3129                 fPO->contains(thisChar)) {
3130             continue;
3131         }
3132
3133         // LB 24  Do not break between prefix and letters or ideographs.
3134         //         (PR | PO) x (AL | HL)
3135         //         (AL | HL) x (PR | PO)
3136         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3137                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3138             continue;
3139         }
3140         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3141                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3142             continue;
3143         }
3144
3145         // LB 25 numbers match, moved up, before LB 8a,
3146
3147         // LB 26 Do not break a Korean syllable.
3148         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3149                                         fJV->contains(thisChar) ||
3150                                         fH2->contains(thisChar) ||
3151                                         fH3->contains(thisChar))) {
3152                                             continue;
3153                                         }
3154
3155         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3156             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3157                 continue;
3158         }
3159
3160         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3161             fJT->contains(thisChar)) {
3162                 continue;
3163         }
3164
3165         // LB 27 Treat a Korean Syllable Block the same as ID.
3166         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3167             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3168             fIN->contains(thisChar)) {
3169                 continue;
3170             }
3171         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3172             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3173             fPO->contains(thisChar)) {
3174                 continue;
3175             }
3176         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3177             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3178                 continue;
3179             }
3180
3181
3182
3183         // LB 28  Do not break between alphabetics ("at").
3184         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3185             continue;
3186         }
3187
3188         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3189         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3190             continue;
3191         }
3192
3193         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3194         //          (AL | NU) x OP
3195         //          CP x (AL | NU)
3196         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3197             continue;
3198         }
3199         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3200             continue;
3201         }
3202
3203         // LB30a    RI RI  ÷  RI
3204         //             RI  x  RI
3205         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3206             break;
3207         }
3208         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3209             // Two Regional Indicators have been paired.
3210             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3211             // following RI. This is a hack.
3212             thisChar = -1;
3213             continue;
3214         }
3215
3216         // LB30b    Emoji Base x Emoji Modifier
3217         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3218             continue;
3219         }
3220
3221         // LB 31    Break everywhere else
3222         break;
3223
3224     }
3225
3226     return pos;
3227 }
3228
3229
3230 UVector  *RBBILineMonkey::charClasses() {
3231     return fSets;
3232 }
3233
3234
// Destructor.  Releases every heap object that this monkey holds through a
// member pointer.  Each member is deleted individually; no member is deleted twice.
RBBILineMonkey::~RBBILineMonkey() {
    // The vector handed out by charClasses().
    delete fSets;

    // The per-class members referenced by the line break rules in next()
    // (names follow the UAX-14 line break class abbreviations).
    delete fBK;
    delete fCR;
    delete fLF;
    delete fCM;
    delete fNL;
    delete fWJ;
    delete fZW;
    delete fGL;
    delete fCB;
    delete fSP;
    delete fB2;
    delete fBA;
    delete fBB;
    delete fHH;
    delete fHY;
    delete fH2;
    delete fH3;
    delete fCL;
    delete fCP;
    delete fEX;
    delete fIN;
    delete fJL;
    delete fJV;
    delete fJT;
    delete fNS;
    delete fOP;
    delete fQU;
    delete fIS;
    delete fNU;
    delete fPO;
    delete fPR;
    delete fSY;
    delete fAI;
    delete fAL;
    delete fCJ;
    delete fHL;
    delete fID;
    delete fRI;
    delete fSG;
    delete fXX;
    delete fEB;
    delete fEM;
    delete fZWJ;

    // Helper objects (presumably a character break iterator and the
    // regular expression matcher used for the LB 25 number rule -- confirm
    // against the constructor).
    delete fCharBI;
    delete fNumberMatcher;
}
3285
3286
3287 //-------------------------------------------------------------------------------------------
3288 //
3289 //   TestMonkey
3290 //
3291 //     params
3292 //       seed=nnnnn        Random number starting seed.
3293 //                         Setting the seed allows errors to be reproduced.
3294 //       loop=nnn          Looping count.  Controls running time.
3295 //                         -1:  run forever.
3296 //                          0 or greater:  run length.
3297 //
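//       type = char | word | line | sent | title
//                         Break iterator type to test.  Defaults to all types.
//
//       utext             Sets the useUText flag that TestMonkey passes to RunMonkey.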
3299 //
3300 //  Example:
3301 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3302 //
3303 //-------------------------------------------------------------------------------------------
3304
3305 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3306     int32_t val = defaultVal;
3307     name.append(" *= *(-?\\d+)");
3308     UErrorCode status = U_ZERO_ERROR;
3309     RegexMatcher m(name, params, 0, status);
3310     if (m.find()) {
3311         // The param exists.  Convert the string to an int.
3312         char valString[100];
3313         int32_t paramLength = m.end(1, status) - m.start(1, status);
3314         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3315             paramLength = (int32_t)(sizeof(valString)-2);
3316         }
3317         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3318         val = strtol(valString,  NULL, 10);
3319
3320         // Delete this parameter from the params string.
3321         m.reset();
3322         params = m.replaceFirst("", status);
3323     }
3324     U_ASSERT(U_SUCCESS(status));
3325     return val;
3326 }
3327 #endif
3328
3329 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3330 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3331                                     BreakIterator *bi,
3332                                     int expected[],
3333                                     int expectedcount)
3334 {
3335     int count = 0;
3336     int i = 0;
3337     int forward[50];
3338     bi->setText(ustr);
3339     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3340         forward[count] = i;
3341         if (count < expectedcount && expected[count] != i) {
3342             test->errln("%s:%d break forward test failed: expected %d but got %d",
3343                         __FILE__, __LINE__, expected[count], i);
3344             break;
3345         }
3346         count ++;
3347     }
3348     if (count != expectedcount) {
3349         printStringBreaks(ustr, expected, expectedcount);
3350         test->errln("%s:%d break forward test failed: missed %d match",
3351                     __FILE__, __LINE__, expectedcount - count);
3352         return;
3353     }
3354     // testing boundaries
3355     for (i = 1; i < expectedcount; i ++) {
3356         int j = expected[i - 1];
3357         if (!bi->isBoundary(j)) {
3358             printStringBreaks(ustr, expected, expectedcount);
3359             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3360                     __FILE__, __LINE__, j);
3361             return;
3362         }
3363         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3364             if (bi->isBoundary(j)) {
3365                 printStringBreaks(ustr, expected, expectedcount);
3366                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3367                     __FILE__, __LINE__, j);
3368                 return;
3369             }
3370         }
3371     }
3372
3373     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3374         count --;
3375         if (forward[count] != i) {
3376             printStringBreaks(ustr, expected, expectedcount);
3377             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3378                         __FILE__, __LINE__, forward[count], i);
3379             break;
3380         }
3381     }
3382     if (count != 0) {
3383         printStringBreaks(ustr, expected, expectedcount);
3384         test->errln("break test previous() failed: missed a match");
3385         return;
3386     }
3387
3388     // testing preceding
3389     for (i = 0; i < expectedcount - 1; i ++) {
3390         // int j = expected[i] + 1;
3391         int j = ustr.moveIndex32(expected[i], 1);
3392         for (; j <= expected[i + 1]; j ++) {
3393             int32_t expectedPreceding = expected[i];
3394             int32_t actualPreceding = bi->preceding(j);
3395             if (actualPreceding != expectedPreceding) {
3396                 printStringBreaks(ustr, expected, expectedcount);
3397                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3398                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3399                 return;
3400             }
3401         }
3402     }
3403 }
3404 #endif
3405
3406 void RBBITest::TestWordBreaks(void)
3407 {
3408 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3409
3410     Locale        locale("en");
3411     UErrorCode    status = U_ZERO_ERROR;
3412     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3413     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3414     // Replaced any C+J characters in a row with a random sequence of characters
3415     // of the same length to make our C+J segmentation not get in the way.
3416     static const char *strlist[] =
3417     {
3418     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3419     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3420     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3421     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3422     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3423     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3424     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3425     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3426     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3427     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3428     "\\u2027\\U000e0067\\u0a47\\u00b7",
3429     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3430     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3431     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3432     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3433     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3434     "\\u0027\\u11af\\U000e0057\\u0602",
3435     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3436     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3437     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3438     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3439     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3440     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3441     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3442     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3443     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3444     "\\u18f4\\U000e0049\\u20e7\\u2027",
3445     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3446     "\\ua183\\u102d\\u0bec\\u003a",
3447     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3448     "\\u003a\\u0e57\\u0fad\\u002e",
3449     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3450     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3451     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3452     "\\u003a\\u0664\\u00b7\\u1fba",
3453     "\\u003b\\u0027\\u00b7\\u47a3",
3454     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3455     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3456     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3457     };
3458     int loop;
3459     if (U_FAILURE(status)) {
3460         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3461         return;
3462     }
3463     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3464         // printf("looping %d\n", loop);
3465         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3466         // RBBICharMonkey monkey;
3467         RBBIWordMonkey monkey;
3468
3469         int expected[50];
3470         int expectedcount = 0;
3471
3472         monkey.setText(ustr);
3473         int i;
3474         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3475             expected[expectedcount ++] = i;
3476         }
3477
3478         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3479     }
3480     delete bi;
3481 #endif
3482 }
3483
3484 void RBBITest::TestWordBoundary(void)
3485 {
3486     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3487     Locale        locale("en");
3488     UErrorCode    status = U_ZERO_ERROR;
3489     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3490     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3491     if (U_FAILURE(status)) {
3492         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3493                 __FILE__, __LINE__, u_errorName(status));
3494         return;
3495     }
3496     UChar         str[50];
3497     static const char *strlist[] =
3498     {
3499     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3500     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3501     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3502     "\\u2027\\U000e0067\\u0a47\\u00b7",
3503     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3504     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3505     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3506     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3507     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3508     "\\u0027\\u11af\\U000e0057\\u0602",
3509     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3510     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3511     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3512     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3513     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3514     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3515     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3516     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3517     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3518     "\\u58f4\\U000e0049\\u20e7\\u2027",
3519     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3520     "\\ua183\\u102d\\u0bec\\u003a",
3521     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3522     "\\u003a\\u0e57\\u0fad\\u002e",
3523     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3524     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3525     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3526     "\\u003a\\u0664\\u00b7\\u1fba",
3527     "\\u003b\\u0027\\u00b7\\u47a3",
3528     };
3529     int loop;
3530     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3531         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3532         UnicodeString ustr(str);
3533         int forward[50];
3534         int count = 0;
3535
3536         bi->setText(ustr);
3537         int prev = -1;
3538         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3539             ++count;
3540             if (count >= UPRV_LENGTHOF(forward)) {
3541                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3542                         __FILE__, __LINE__, loop, count, boundary);
3543                 return;
3544             }
3545             forward[count] = boundary;
3546             if (boundary <= prev) {
3547                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3548                         __FILE__, __LINE__, loop, prev, boundary);
3549                 break;
3550             }
3551             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3552                 if (bi->isBoundary(nonBoundary)) {
3553                     printStringBreaks(ustr, forward, count);
3554                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3555                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3556                     return;
3557                 }
3558             }
3559             if (!bi->isBoundary(boundary)) {
3560                 printStringBreaks(ustr, forward, count);
3561                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3562                        __FILE__, __LINE__, boundary);
3563                 return;
3564             }
3565             prev = boundary;
3566         }
3567     }
3568 }
3569
3570 void RBBITest::TestLineBreaks(void)
3571 {
3572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3573     Locale        locale("en");
3574     UErrorCode    status = U_ZERO_ERROR;
3575     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3576     const int32_t  STRSIZE = 50;
3577     UChar         str[STRSIZE];
3578     static const char *strlist[] =
3579     {
3580      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3581      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3582              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3583      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3584              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3585      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3586      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3587      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3588      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3589      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3590      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3591      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3592      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3593      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3594      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3595      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3596      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3597      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3598      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3599      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3600      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3601      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3602      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3603      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3604      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3605      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3606      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3607      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3608      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3609      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3610      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3611      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3612      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3613      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3614      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3615      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3616      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3617      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3618      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3619          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3620     };
3621     int loop;
3622     TEST_ASSERT_SUCCESS(status);
3623     if (U_FAILURE(status)) {
3624         return;
3625     }
3626     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3627         // printf("looping %d\n", loop);
3628         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3629         if (t >= STRSIZE) {
3630             TEST_ASSERT(FALSE);
3631             continue;
3632         }
3633
3634
3635         UnicodeString ustr(str);
3636         RBBILineMonkey monkey;
3637         if (U_FAILURE(monkey.deferredStatus)) {
3638             continue;
3639         }
3640
3641         const int EXPECTEDSIZE = 50;
3642         int expected[EXPECTEDSIZE];
3643         int expectedcount = 0;
3644
3645         monkey.setText(ustr);
3646         int i;
3647         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3648             if (expectedcount >= EXPECTEDSIZE) {
3649                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3650                 return;
3651             }
3652             expected[expectedcount ++] = i;
3653         }
3654
3655         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3656     }
3657     delete bi;
3658 #endif
3659 }
3660
3661 void RBBITest::TestSentBreaks(void)
3662 {
3663 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3664     Locale        locale("en");
3665     UErrorCode    status = U_ZERO_ERROR;
3666     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3667     UChar         str[200];
3668     static const char *strlist[] =
3669     {
3670      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3671      "This\n",
3672      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3673      "\"Sentence ending with a quote.\" Bye.",
3674      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3675      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3676      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3677      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3678      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3679      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3680      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3681              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3682              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3683              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3684      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3685              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3686              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3687              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3688              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3689              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3690     };
3691     int loop;
3692     if (U_FAILURE(status)) {
3693         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3694         return;
3695     }
3696     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3697         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3698         UnicodeString ustr(str);
3699
3700         RBBISentMonkey monkey;
3701         if (U_FAILURE(monkey.deferredStatus)) {
3702             continue;
3703         }
3704
3705         const int EXPECTEDSIZE = 50;
3706         int expected[EXPECTEDSIZE];
3707         int expectedcount = 0;
3708
3709         monkey.setText(ustr);
3710         int i;
3711         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3712             if (expectedcount >= EXPECTEDSIZE) {
3713                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3714                 return;
3715             }
3716             expected[expectedcount ++] = i;
3717         }
3718
3719         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3720     }
3721     delete bi;
3722 #endif
3723 }
3724
3725 void RBBITest::TestMonkey() {
3726 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3727
3728     UErrorCode     status    = U_ZERO_ERROR;
3729     int32_t        loopCount = 500;
3730     int32_t        seed      = 1;
3731     UnicodeString  breakType = "all";
3732     Locale         locale("en");
3733     UBool          useUText  = FALSE;
3734
3735     if (quick == FALSE) {
3736         loopCount = 10000;
3737     }
3738
3739     if (fTestParams) {
3740         UnicodeString p(fTestParams);
3741         loopCount = getIntParam("loop", p, loopCount);
3742         seed      = getIntParam("seed", p, seed);
3743
3744         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3745         if (m.find()) {
3746             breakType = m.group(1, status);
3747             m.reset();
3748             p = m.replaceFirst("", status);
3749         }
3750
3751         RegexMatcher u(" *utext", p, 0, status);
3752         if (u.find()) {
3753             useUText = TRUE;
3754             u.reset();
3755             p = u.replaceFirst("", status);
3756         }
3757
3758
3759         // m.reset(p);
3760         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3761             // Each option is stripped out of the option string as it is processed.
3762             // All options have been checked.  The option string should have been completely emptied..
3763             char buf[100];
3764             p.extract(buf, sizeof(buf), NULL, status);
3765             buf[sizeof(buf)-1] = 0;
3766             errln("Unrecognized or extra parameter:  %s\n", buf);
3767             return;
3768         }
3769
3770     }
3771
3772     if (breakType == "char" || breakType == "all") {
3773         RBBICharMonkey  m;
3774         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3775         if (U_SUCCESS(status)) {
3776             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3777             if (breakType == "all" && useUText==FALSE) {
3778                 // Also run a quick test with UText when "all" is specified
3779                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3780             }
3781         }
3782         else {
3783             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3784         }
3785         delete bi;
3786     }
3787
3788     if (breakType == "word" || breakType == "all") {
3789         logln("Word Break Monkey Test");
3790         RBBIWordMonkey  m;
3791         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3792         if (U_SUCCESS(status)) {
3793             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3794         }
3795         else {
3796             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3797         }
3798         delete bi;
3799     }
3800
3801     if (breakType == "line" || breakType == "all") {
3802         logln("Line Break Monkey Test");
3803         RBBILineMonkey  m;
3804         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3805         if (loopCount >= 10) {
3806             loopCount = loopCount / 5;   // Line break runs slower than the others.
3807         }
3808         if (U_SUCCESS(status)) {
3809             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3810         }
3811         else {
3812             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3813         }
3814         delete bi;
3815     }
3816
3817     if (breakType == "sent" || breakType == "all"  ) {
3818         logln("Sentence Break Monkey Test");
3819         RBBISentMonkey  m;
3820         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3821         if (loopCount >= 10) {
3822             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3823         }
3824         if (U_SUCCESS(status)) {
3825             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3826         }
3827         else {
3828             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3829         }
3830         delete bi;
3831     }
3832
3833 #endif
3834 }
3835
3836 //
3837 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3838 //    Parameters:
3839 //       bi      - the break iterator to use
3840 //       mk      - MonkeyKind, abstraction for obtaining expected results
3841 //       name    - Name of test (char, word, etc.) for use in error messages
3842 //       seed    - Seed for starting random number generator (parameter from user)
3843 //       numIterations
3844 //
3845 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3846                          int32_t numIterations, UBool useUText) {
3847
3848 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3849
3850     const int32_t    TESTSTRINGLEN = 500;
3851     UnicodeString    testText;
3852     int32_t          numCharClasses;
3853     UVector          *chClasses;
3854     int              expected[TESTSTRINGLEN*2 + 1];
3855     int              expectedCount = 0;
3856     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3857     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3858     char             reverseBreaks[TESTSTRINGLEN*2+1];
3859     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3860     char             followingBreaks[TESTSTRINGLEN*2+1];
3861     char             precedingBreaks[TESTSTRINGLEN*2+1];
3862     int              i;
3863     int              loopCount = 0;
3864
3865     m_seed = seed;
3866
3867     numCharClasses = mk.charClasses()->size();
3868     chClasses      = mk.charClasses();
3869
3870     // Check for errors that occured during the construction of the MonkeyKind object.
3871     //  Can't report them where they occured because errln() is a method coming from intlTest,
3872     //  and is not visible outside of RBBITest :-(
3873     if (U_FAILURE(mk.deferredStatus)) {
3874         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3875         return;
3876     }
3877
3878     // Verify that the character classes all have at least one member.
3879     for (i=0; i<numCharClasses; i++) {
3880         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3881         if (s == NULL || s->size() == 0) {
3882             errln("Character Class #%d is null or of zero size.", i);
3883             return;
3884         }
3885     }
3886
3887     while (loopCount < numIterations || numIterations == -1) {
3888         if (numIterations == -1 && loopCount % 10 == 0) {
3889             // If test is running in an infinite loop, display a periodic tic so
3890             //   we can tell that it is making progress.
3891             fprintf(stderr, ".");
3892         }
3893         // Save current random number seed, so that we can recreate the random numbers
3894         //   for this loop iteration in event of an error.
3895         seed = m_seed;
3896
3897         // Populate a test string with data.
3898         testText.truncate(0);
3899         for (i=0; i<TESTSTRINGLEN; i++) {
3900             int32_t  aClassNum = m_rand() % numCharClasses;
3901             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3902             int32_t   charIdx = m_rand() % classSet->size();
3903             UChar32   c = classSet->charAt(charIdx);
3904             if (c < 0) {   // TODO:  deal with sets containing strings.
3905                 errln("%s:%d c < 0", __FILE__, __LINE__);
3906                 break;
3907             }
3908             // Do not assemble a supplementary character from randomly generated separate surrogates.
3909             //   (It could be a dictionary character)
3910             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
3911                 continue;
3912             }
3913
3914             testText.append(c);
3915         }
3916
3917         // Calculate the expected results for this test string.
3918         mk.setText(testText);
3919         memset(expectedBreaks, 0, sizeof(expectedBreaks));
3920         expectedBreaks[0] = 1;
3921         int32_t breakPos = 0;
3922         expectedCount = 0;
3923         for (;;) {
3924             breakPos = mk.next(breakPos);
3925             if (breakPos == -1) {
3926                 break;
3927             }
3928             if (breakPos > testText.length()) {
3929                 errln("breakPos > testText.length()");
3930             }
3931             expectedBreaks[breakPos] = 1;
3932             U_ASSERT(expectedCount<testText.length());
3933             expected[expectedCount ++] = breakPos;
3934             (void)expected;   // Set but not used warning.
3935                               // TODO (andy): check it out.
3936         }
3937
3938         // Find the break positions using forward iteration
3939         memset(forwardBreaks, 0, sizeof(forwardBreaks));
3940         if (useUText) {
3941             UErrorCode status = U_ZERO_ERROR;
3942             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3943             // testUText = utext_openUnicodeString(testUText, &testText, &status);
3944             bi->setText(testUText, status);
3945             TEST_ASSERT_SUCCESS(status);
3946             utext_close(testUText);   // The break iterator does a shallow clone of the UText
3947                                       //  This UText can be closed immediately, so long as the
3948                                       //  testText string continues to exist.
3949         } else {
3950             bi->setText(testText);
3951         }
3952
3953         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3954             if (i < 0 || i > testText.length()) {
3955                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3956                 break;
3957             }
3958             forwardBreaks[i] = 1;
3959         }
3960
3961         // Find the break positions using reverse iteration
3962         memset(reverseBreaks, 0, sizeof(reverseBreaks));
3963         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3964             if (i < 0 || i > testText.length()) {
3965                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3966                 break;
3967             }
3968             reverseBreaks[i] = 1;
3969         }
3970
3971         // Find the break positions using isBoundary() tests.
3972         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3973         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3974         for (i=0; i<=testText.length(); i++) {
3975             isBoundaryBreaks[i] = bi->isBoundary(i);
3976         }
3977
3978
3979         // Find the break positions using the following() function.
3980         // printf(".");
3981         memset(followingBreaks, 0, sizeof(followingBreaks));
3982         int32_t   lastBreakPos = 0;
3983         followingBreaks[0] = 1;
3984         for (i=0; i<testText.length(); i++) {
3985             breakPos = bi->following(i);
3986             if (breakPos <= i ||
3987                 breakPos < lastBreakPos ||
3988                 breakPos > testText.length() ||
3989                 (breakPos > lastBreakPos && lastBreakPos > i)) {
3990                 errln("%s break monkey test: "
3991                     "Out of range value returned by BreakIterator::following().\n"
3992                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
3993                          name, seed, i, breakPos, lastBreakPos);
3994                 break;
3995             }
3996             followingBreaks[breakPos] = 1;
3997             lastBreakPos = breakPos;
3998         }
3999
4000         // Find the break positions using the preceding() function.
4001         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4002         lastBreakPos = testText.length();
4003         precedingBreaks[testText.length()] = 1;
4004         for (i=testText.length(); i>0; i--) {
4005             breakPos = bi->preceding(i);
4006             if (breakPos >= i ||
4007                 breakPos > lastBreakPos ||
4008                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4009                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4010                 errln("%s break monkey test: "
4011                     "Out of range value returned by BreakIterator::preceding().\n"
4012                     "index=%d;  prev returned %d; lastBreak=%d" ,
4013                     name,  i, breakPos, lastBreakPos);
4014                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4015                     precedingBreaks[i] = 2;   // Forces an error.
4016                 }
4017             } else {
4018                 if (breakPos >= 0) {
4019                     precedingBreaks[breakPos] = 1;
4020                 }
4021                 lastBreakPos = breakPos;
4022             }
4023         }
4024
4025         // Compare the expected and actual results.
4026         for (i=0; i<=testText.length(); i++) {
4027             const char *errorType = NULL;
4028             if  (forwardBreaks[i] != expectedBreaks[i]) {
4029                 errorType = "next()";
4030             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4031                 errorType = "previous()";
4032             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4033                 errorType = "isBoundary()";
4034             } else if (followingBreaks[i] != expectedBreaks[i]) {
4035                 errorType = "following()";
4036             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4037                 errorType = "preceding()";
4038             }
4039
4040
4041             if (errorType != NULL) {
4042                 // Format a range of the test text that includes the failure as
4043                 //  a data item that can be included in the rbbi test data file.
4044
4045                 // Start of the range is the last point where expected and actual results
4046                 //   both agreed that there was a break position.
4047                 int startContext = i;
4048                 int32_t count = 0;
4049                 for (;;) {
4050                     if (startContext==0) { break; }
4051                     startContext --;
4052                     if (expectedBreaks[startContext] != 0) {
4053                         if (count == 2) break;
4054                         count ++;
4055                     }
4056                 }
4057
4058                 // End of range is two expected breaks past the start position.
4059                 int endContext = i + 1;
4060                 int ci;
4061                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4062                     for (;;) {
4063                         if (endContext >= testText.length()) {break;}
4064                         if (expectedBreaks[endContext-1] != 0) {
4065                             if (count == 0) break;
4066                             count --;
4067                         }
4068                         endContext ++;
4069                     }
4070                 }
4071
4072                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4073                 UnicodeString errorText = "<data>";
4074                 /***if (strcmp(errorType, "next()") == 0) {
4075                     startContext = 0;
4076                     endContext = testText.length();
4077
4078                     printStringBreaks(testText, expected, expectedCount);
4079                 }***/
4080
4081                 for (ci=startContext; ci<endContext;) {
4082                     UnicodeString hexChars("0123456789abcdef");
4083                     UChar32  c;
4084                     int      bn;
4085                     c = testText.char32At(ci);
4086                     if (ci == i) {
4087                         // This is the location of the error.
4088                         errorText.append("<?>");
4089                     } else if (expectedBreaks[ci] != 0) {
4090                         // This a non-error expected break position.
4091                         errorText.append("\\");
4092                     }
4093                     if (c < 0x10000) {
4094                         errorText.append("\\u");
4095                         for (bn=12; bn>=0; bn-=4) {
4096                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4097                         }
4098                     } else {
4099                         errorText.append("\\U");
4100                         for (bn=28; bn>=0; bn-=4) {
4101                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4102                         }
4103                     }
4104                     ci = testText.moveIndex32(ci, 1);
4105                 }
4106                 errorText.append("\\");
4107                 errorText.append("</data>\n");
4108
4109                 // Output the error
4110                 char  charErrorTxt[500];
4111                 UErrorCode status = U_ZERO_ERROR;
4112                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4113                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4114                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4115
4116                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4117                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4118                     errorType, seed, i, charErrorTxt);
4119                 break;
4120             }
4121         }
4122
4123         loopCount++;
4124     }
4125 #endif
4126 }
4127
4128
4129 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4130 //             This test checks the initial patch,
4131 //             which is to just keep it from crashing.  Correct word boundaries
4132 //             await a proper fix to the dictionary code.
4133 //
4134 void RBBITest::TestBug5532(void)  {
4135    // Text includes a mixture of Thai and Latin.
4136    const unsigned char utf8Data[] = {
4137            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4138            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4139            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4140            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4141            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4142            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4143            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4144            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4145            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4146            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4147            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4148
4149     UErrorCode status = U_ZERO_ERROR;
4150     UText utext=UTEXT_INITIALIZER;
4151     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4152     TEST_ASSERT_SUCCESS(status);
4153
4154     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4155     TEST_ASSERT_SUCCESS(status);
4156     if (U_SUCCESS(status)) {
4157         bi->setText(&utext, status);
4158         TEST_ASSERT_SUCCESS(status);
4159
4160         int32_t breakCount = 0;
4161         int32_t previousBreak = -1;
4162         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4163             // For now, just make sure that the break iterator doesn't hang.
4164             TEST_ASSERT(previousBreak < bi->current());
4165             previousBreak = bi->current();
4166         }
4167         TEST_ASSERT(breakCount > 0);
4168     }
4169     delete bi;
4170     utext_close(&utext);
4171 }
4172
4173
4174 void RBBITest::TestBug9983(void)  {
4175     UnicodeString text = UnicodeString("\\u002A"  // * Other
4176                                        "\\uFF65"  //   Other
4177                                        "\\u309C"  //   Katakana
4178                                        "\\uFF9F"  //   Extend
4179                                        "\\uFF65"  //   Other
4180                                        "\\u0020"  //   Other
4181                                        "\\u0000").unescape();
4182
4183     UErrorCode status = U_ZERO_ERROR;
4184     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4185         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4186     TEST_ASSERT_SUCCESS(status);
4187     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4188         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4189     TEST_ASSERT_SUCCESS(status);
4190     if (U_FAILURE(status)) {
4191         return;
4192     }
4193     int32_t offset, rstatus, iterationCount;
4194
4195     brkiter->setText(text);
4196     brkiter->last();
4197     iterationCount = 0;
4198     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4199         iterationCount++;
4200         rstatus = brkiter->getRuleStatus();
4201         (void)rstatus;     // Suppress set but not used warning.
4202         if (iterationCount >= 10) {
4203            break;
4204         }
4205     }
4206     TEST_ASSERT(iterationCount == 6);
4207
4208     brkiterPOSIX->setText(text);
4209     brkiterPOSIX->last();
4210     iterationCount = 0;
4211     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4212         iterationCount++;
4213         rstatus = brkiterPOSIX->getRuleStatus();
4214         (void)rstatus;     // Suppress set but not used warning.
4215         if (iterationCount >= 10) {
4216            break;
4217         }
4218     }
4219     TEST_ASSERT(iterationCount == 6);
4220 }
4221
4222 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4223 //
4224 void RBBITest::TestBug7547() {
4225     UnicodeString rules;
4226     UErrorCode status = U_ZERO_ERROR;
4227     UParseError parseError;
4228     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4229     if (status != U_BRK_RULE_SYNTAX) {
4230         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4231     }
4232     if (parseError.line != 1 || parseError.offset != 0) {
4233         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4234     }
4235 }
4236
4237
4238 void RBBITest::TestBug12797() {
4239     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4240     UErrorCode status = U_ZERO_ERROR;
4241     UParseError parseError;
4242     RuleBasedBreakIterator bi(rules, parseError, status);
4243     if (U_FAILURE(status)) {
4244         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4245         return;
4246     }
4247     UnicodeString text = "abc";
4248     bi.setText(text);
4249     bi.first();
4250     int32_t boundary = bi.next();
4251     if (boundary != 3) {
4252         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4253     }
4254 }
4255
4256 void RBBITest::TestBug12918() {
4257     // This test triggers an assertion failure in dictbe.cpp
4258     const UChar *crasherString = u"\u3325\u4a16";
4259     UErrorCode status = U_ZERO_ERROR;
4260     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4261     if (U_FAILURE(status)) {
4262         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4263         return;
4264     }
4265     ubrk_first(iter);
4266     int32_t pos = 0;
4267     int32_t lastPos = -1;
4268     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4269         if (pos <= lastPos) {
4270             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4271             break;
4272         }
4273     }
4274     ubrk_close(iter);
4275 }
4276
4277 void RBBITest::TestBug12932() {
4278     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4279     UnicodeString ruleStr(
4280             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4281             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4282             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4283             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4284             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4285             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4286
4287     UErrorCode status = U_ZERO_ERROR;
4288     UParseError parseError;
4289     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4290     if (status != U_BRK_RULE_SYNTAX) {
4291         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4292                 __FILE__, __LINE__, u_errorName(status));
4293     }
4294 }
4295
4296
4297 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4298 //             remain undevided by ICU char, word and line break.
4299 void RBBITest::TestEmoji() {
4300 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4301     UErrorCode  status = U_ZERO_ERROR;
4302
4303     CharString testFileName;
4304     testFileName.append(IntlTest::getSourceTestData(status), status);
4305     testFileName.appendPathPart("emoji-test.txt", status);
4306     if (U_FAILURE(status)) {
4307         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4308         return;
4309     }
4310     logln("Opening data file %s\n", testFileName.data());
4311
4312     int    len;
4313     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4314     if (U_FAILURE(status) || testFile == NULL) {
4315         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4316         return;
4317     }
4318     UnicodeString testFileAsString(testFile, len);
4319     delete [] testFile;
4320
4321     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4322     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4323     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4324     int32_t lineNumber = 0;
4325
4326     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4327     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4328     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4329     if (U_FAILURE(status)) {
4330         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4331         return;
4332     }
4333
4334     while (lineMatcher.find()) {
4335         ++lineNumber;
4336         UnicodeString line = lineMatcher.group(status);
4337         hexMatcher.reset(line);
4338         UnicodeString testString;   // accumulates the emoji sequence.
4339         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4340             UnicodeString hex = hexMatcher.group(1, status);
4341             if (hex.length() > 8) {
4342                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4343                 break;
4344             }
4345             CharString hex8;
4346             hex8.appendInvariantChars(hex, status);
4347             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4348             if (c<=0x10ffff) {
4349                 testString.append(c);
4350             } else {
4351                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4352                         __FILE__, __LINE__, lineNumber, hex8.data());
4353                 break;
4354             }
4355         }
4356
4357         if (testString.length() > 1) {
4358             charBreaks->setText(testString);
4359             charBreaks->first();
4360             int32_t firstBreak = charBreaks->next();
4361             if (testString.length() != firstBreak) {
4362                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4363                         __FILE__, __LINE__, lineNumber, firstBreak);
4364             }
4365             wordBreaks->setText(testString);
4366             wordBreaks->first();
4367             firstBreak = wordBreaks->next();
4368             if (testString.length() != firstBreak) {
4369                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4370                         __FILE__, __LINE__, lineNumber, firstBreak);
4371             }
4372             lineBreaks->setText(testString);
4373             lineBreaks->first();
4374             firstBreak = lineBreaks->next();
4375             if (testString.length() != firstBreak) {
4376                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4377                         __FILE__, __LINE__, lineNumber, firstBreak);
4378             }
4379         }
4380     }
4381 #endif
4382 }
4383
4384
4385 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4386
4387 void RBBITest::TestBug12519() {
4388     UErrorCode status = U_ZERO_ERROR;
4389     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4390     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4391     if (!assertSuccess(WHERE, status)) {
4392         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4393         return;
4394     }
4395     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4396
4397     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4398     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4399
4400     LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
4401     assertTrue(WHERE, *biEn == *cloneEn);
4402     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4403
4404     LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
4405     assertTrue(WHERE, *biFr == *cloneFr);
4406     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4407
4408     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4409     UnicodeString text("Hallo Welt");
4410     biDe->setText(text);
4411     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4412     *biDe = *biFr;
4413     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4414 }
4415
4416 void RBBITest::TestBug12677() {
4417     // Check that stripping of comments from rules for getRules() is not confused by
4418     // the presence of '#' characters in the rules that do not introduce comments.
4419     UnicodeString rules(u"!!forward; \n"
4420                          "$x = [ab#];  # a set with a # literal. \n"
4421                          " # .;        # a comment that looks sort of like a rule.   \n"
4422                          " '#' '?';    # a rule with a quoted #   \n"
4423                        );
4424
4425     UErrorCode status = U_ZERO_ERROR;
4426     UParseError pe;
4427     RuleBasedBreakIterator bi(rules, pe, status);
4428     assertSuccess(WHERE, status);
4429     UnicodeString rtRules = bi.getRules();
4430     assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
4431 }
4432
4433
4434 void RBBITest::TestTableRedundancies() {
4435     UErrorCode status = U_ZERO_ERROR;
4436
4437     LocalPointer<RuleBasedBreakIterator> bi (
4438         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4439     assertSuccess(WHERE, status);
4440     if (U_FAILURE(status)) return;
4441
4442     RBBIDataWrapper *dw = bi->fData;
4443     const RBBIStateTable *fwtbl = dw->fForwardTable;
4444     int32_t numCharClasses = dw->fHeader->fCatCount;
4445     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4446
4447     // Check for duplicate columns (character categories)
4448
4449     std::vector<UnicodeString> columns;
4450     for (int32_t column = 0; column < numCharClasses; column++) {
4451         UnicodeString s;
4452         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4453             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4454             s.append(row->fNextState[column]);
4455         }
4456         columns.push_back(s);
4457     }
4458     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4459     for (int c1=1; c1<numCharClasses; c1++) {
4460         for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4461             if (columns.at(c1) == columns.at(c2)) {
4462                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4463                 goto out;
4464             }
4465         }
4466     }
4467   out:
4468
4469     // Check for duplicate states
4470     std::vector<UnicodeString> rows;
4471     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4472         UnicodeString s;
4473         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4474         assertTrue(WHERE, row->fAccepting >= -1);
4475         s.append(row->fAccepting + 1);   // values of -1 are expected.
4476         s.append(row->fLookAhead);
4477         s.append(row->fTagIdx);
4478         for (int32_t column = 0; column < numCharClasses; column++) {
4479             s.append(row->fNextState[column]);
4480         }
4481         rows.push_back(s);
4482     }
4483     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4484         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4485             if (rows.at(r1) == rows.at(r2)) {
4486                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4487                 return;
4488             }
4489         }
4490     }
4491 }
4492
4493 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4494 //            even after next() has returned DONE.
4495
4496 void RBBITest::TestBug13447() {
4497     UErrorCode status = U_ZERO_ERROR;
4498     LocalPointer<RuleBasedBreakIterator> bi(
4499         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4500     assertSuccess(WHERE, status);
4501     if (U_FAILURE(status)) return;
4502     UnicodeString data(u"1234");
4503     bi->setText(data);
4504     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4505     assertEquals(WHERE, 4, bi->next());
4506     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4507     assertEquals(WHERE, UBRK_DONE, bi->next());
4508     assertEquals(WHERE, 4, bi->current());
4509     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4510 }
4511
4512 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4513 //  for filling the break iterator cache when starting from random positions
4514 //  in the text.
4515 //
4516 //  It's a monkey test, working on random data, with the expected data obtained
4517 //  from forward iteration (no safe rules involved), comparing with results
4518 //  when indexing into the interior of the string (safe rules needed).
4519
4520 void RBBITest::TestReverse() {
4521     UErrorCode status = U_ZERO_ERROR;
4522
4523     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4524             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4525     assertSuccess(WHERE, status, true);
4526     status = U_ZERO_ERROR;
4527     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4528             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4529     assertSuccess(WHERE, status, true);
4530     status = U_ZERO_ERROR;
4531     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4532             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4533     assertSuccess(WHERE, status, true);
4534     status = U_ZERO_ERROR;
4535     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4536             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4537     assertSuccess(WHERE, status, true);
4538 }
4539
4540 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4541     if (!bi) {
4542         return;
4543     }
4544
4545     // From the mapping trie in the break iterator's internal data, create a
4546     // vector of UnicodeStrings, one for each character category, containing
4547     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4548     // to avoid an execess of unassigned code points.
4549
4550     RBBIDataWrapper *data = bi->fData;
4551     int32_t categoryCount = data->fHeader->fCatCount;
4552     UTrie2  *trie = data->fTrie;
4553
4554     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4555     for (int cp=0; cp<0x1fff0; ++cp) {
4556         int cat = utrie2_get32(trie, cp);
4557         cat &= ~0x4000;    // And off the dictionary bit from the category.
4558         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4559         if (cat < 0 || cat >= categoryCount) return;
4560         strings[cat].append(cp);
4561     }
4562
4563     icu_rand randomGen;
4564     const int testStringLength = 10000;
4565     UnicodeString testString;
4566
4567     for (int i=0; i<testStringLength; ++i) {
4568         int charClass = randomGen() % categoryCount;
4569         if (strings[charClass].length() > 0) {
4570             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4571             testString.append(cp);
4572         }
4573     }
4574
4575     typedef std::pair<UBool, int32_t> Result;
4576     std::vector<Result> expectedResults;
4577     bi->setText(testString);
4578     for (int i=0; i<testString.length(); ++i) {
4579         bool isboundary = bi->isBoundary(i);
4580         int  ruleStatus = bi->getRuleStatus();
4581         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4582     }
4583
4584     for (int i=testString.length()-1; i>=0; --i) {
4585         bi->setText(testString);   // clears the internal break cache
4586         Result expected = expectedResults[i];
4587         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4588         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4589     }
4590 }
4591
4592
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4596
4597 void RBBITest::TestBug13692() {
4598     UErrorCode status = U_ZERO_ERROR;
4599     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4600             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4601     if (!assertSuccess(WHERE, status, true)) {
4602         return;
4603     }
4604     constexpr int32_t LENGTH = 1000000;
4605     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4606     for (int i=0; i<20; i+=2) {
4607         longNumber.setCharAt(i, u' ');
4608     }
4609     bi->setText(longNumber);
4610     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4611     assertSuccess(WHERE, status);
4612 }
4613
4614 //
4615 //  TestDebug    -  A place-holder test for debugging purposes.
4616 //                  For putting in fragments of other tests that can be invoked
4617 //                  for tracing  without a lot of unwanted extra stuff happening.
4618 //
4619 void RBBITest::TestDebug(void) {
4620     UErrorCode status = U_ZERO_ERROR;
4621     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4622             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4623     if (!assertSuccess(WHERE, status, true)) {
4624         return;
4625     }
4626     const UnicodeString &rules = bi->getRules();
4627     UParseError pe;
4628     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4629     assertSuccess(WHERE, status);
4630 }
4631
4632 void RBBITest::TestProperties() {
4633     UErrorCode errorCode = U_ZERO_ERROR;
4634     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4635     if (!prependSet.isEmpty()) {
4636         errln(
4637             "[:GCB=Prepend:] is not empty any more. "
4638             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4639             "change this test to the opposite condition.");
4640     }
4641 }
4642
4643 #endif // #if !UCONFIG_NO_BREAK_ITERATION