icuSources/test/intltest/rbbitst.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1999-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /************************************************************************
   9 *   Date        Name        Description
  10 *   12/15/99    Madhu        Creation.
  11 *   01/12/2000  Madhu        Updated for changed API and added new tests
  12 ************************************************************************/
  13
  14 #include "unicode/utypes.h"
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <utility>
  21 #include <vector>
  22
  23 #include "unicode/brkiter.h"
  24 #include "unicode/localpointer.h"
  25 #include "unicode/numfmt.h"
  26 #include "unicode/rbbi.h"
  27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  28 #include "unicode/regex.h"
  29 #endif
  30 #include "unicode/schriter.h"
  31 #include "unicode/uchar.h"
  32 #include "unicode/utf16.h"
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uniset.h"
  35 #include "unicode/uscript.h"
  36 #include "unicode/ustring.h"
  37 #include "unicode/utext.h"
  38
  39 #include "charstr.h"
  40 #include "cmemory.h"
  41 #include "cstr.h"
  42 #include "intltest.h"
  43 #include "rbbitst.h"
  44 #include "rbbidata.h"
  45 #include "utypeinfo.h"  // for 'typeid' to work
  46 #include "uvector.h"
  47 #include "uvectr32.h"
  48
  49 // Needed for Apple perf tests <rdar://problem/51193810>
  50 #include <unistd.h>
  51 #include <mach/mach_time.h>
  52
  53
  54 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  55 #include "unicode/filteredbrk.h"
  56 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  57
  // TEST_ASSERT(x): if the expression x evaluates to false, report a failure via
  // errln() that names the current source file and line. The expansion is
  // wrapped in UPRV_BLOCK_MACRO_BEGIN / UPRV_BLOCK_MACRO_END.
  #define TEST_ASSERT(x) \
      UPRV_BLOCK_MACRO_BEGIN { \
          if (!(x)) { \
              errln("Failure in file %s, line %d", __FILE__, __LINE__); \
          } \
      } UPRV_BLOCK_MACRO_END
  63
  // TEST_ASSERT_SUCCESS(errcode): if errcode is a failure code, report it via
  // errcheckln() together with the current source file, line and the symbolic
  // name of the error. The expansion is wrapped in
  // UPRV_BLOCK_MACRO_BEGIN / UPRV_BLOCK_MACRO_END.
  #define TEST_ASSERT_SUCCESS(errcode) \
      UPRV_BLOCK_MACRO_BEGIN { \
          if (U_FAILURE(errcode)) { \
              errcheckln(errcode, \
                         "Failure in file %s, line %d, status = \"%s\"", \
                         __FILE__, __LINE__, u_errorName(errcode)); \
          } \
      } UPRV_BLOCK_MACRO_END
  69
  // MONKEY_ERROR(msg, fRuleFileName, index, seed): report a failure through
  // IntlTest::gTest->errln(). The message contains the current file and line,
  // the text msg, the failing index, and a parameter string
  // (type=<fRuleFileName> seed=<seed> loop=1) for reproducing the failure.
  #define MONKEY_ERROR(msg, fRuleFileName, index, seed) \
      { \
          IntlTest::gTest->errln( \
              "\n%s:%d %s at index %d. Parameters to reproduce: @\"type=%s seed=%u loop=1\"", \
              __FILE__, __LINE__, msg, index, fRuleFileName, seed); \
      }
  74
  75 //---------------------------------------------
  76 // runIndexedTest
  77 //---------------------------------------------
  78
  79
  80 //  Note:  Before adding new tests to this file, check whether the desired test data can
  81 //         simply be added to the file testdata/rbbitst.txt.  In most cases it can,
  82 //         it's much less work than writing a new test, diagnostic output in the event of failures
  83 //         is good, and the test data file is shared with ICU4J, so eventually the test
  84 //         will run there as well, without additional effort.
  85
  86 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  87 {
  88     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  89     fTestParams = params;
  90
  91     TESTCASE_AUTO_BEGIN;
  92 #if !UCONFIG_NO_FILE_IO
  93     TESTCASE_AUTO(TestBug4153072);
  94 #endif
  95 #if !UCONFIG_NO_FILE_IO
  96     TESTCASE_AUTO(TestUnicodeFiles);
  97 #endif
  98     TESTCASE_AUTO(TestGetAvailableLocales);
  99     TESTCASE_AUTO(TestGetDisplayName);
 100 #if !UCONFIG_NO_FILE_IO
 101     TESTCASE_AUTO(TestEndBehaviour);
 102     TESTCASE_AUTO(TestWordBreaks);
 103     TESTCASE_AUTO(TestWordBoundary);
 104     TESTCASE_AUTO(TestLineBreaks);
 105     TESTCASE_AUTO(TestSentBreaks);
 106     TESTCASE_AUTO(TestExtended);
 107 #endif
 108 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 109     TESTCASE_AUTO(TestMonkey);
 110 #endif
 111 #if !UCONFIG_NO_FILE_IO
 112     TESTCASE_AUTO(TestBug3818);
 113 #endif
 114     TESTCASE_AUTO(TestDebug);
 115 #if !UCONFIG_NO_FILE_IO
 116     TESTCASE_AUTO(TestBug5775);
 117 #endif
 118     TESTCASE_AUTO(TestBug9983);
 119     TESTCASE_AUTO(TestDictRules);
 120     TESTCASE_AUTO(TestBug5532);
 121     TESTCASE_AUTO(TestBug7547);
 122     TESTCASE_AUTO(TestBug12797);
 123     TESTCASE_AUTO(TestBug12918);
 124     TESTCASE_AUTO(TestBug12932);
 125     TESTCASE_AUTO(TestEmoji);
 126     TESTCASE_AUTO(TestBug12519);
 127     TESTCASE_AUTO(TestBug12677);
 128     TESTCASE_AUTO(TestTableRedundancies);
 129     TESTCASE_AUTO(TestBug13447);
 130     TESTCASE_AUTO(TestReverse);
 131     TESTCASE_AUTO(TestBug13692);
 132     TESTCASE_AUTO_END;
 133 }
 134
 135
 136 //--------------------------------------------------------------------------------------
 137 //
 138 //    RBBITest    constructor and destructor
 139 //
 140 //--------------------------------------------------------------------------------------
 141
 142 RBBITest::RBBITest() {
 143     fTestParams = NULL;
 144 }
 145
 146
 147 RBBITest::~RBBITest() {
 148 }
 149
 150
 151 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 152     UErrorCode status = U_ZERO_ERROR;
 153     char name[100];
 154     printf("code    alpha extend alphanum type word sent line name\n");
 155     int nextExpectedIndex = 0;
 156     utext_setNativeIndex(tstr, 0);
 157     for (int j = 0; j < static_cast<int>(utext_nativeLength(tstr)); j=static_cast<int>(utext_getNativeIndex(tstr))) {
 158         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 159             printf("------------------------------------------------ %d\n", j);
 160             ++nextExpectedIndex;
 161         }
 162
 163         UChar32 c = utext_next32(tstr);
 164         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 165         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 166                            u_isUAlphabetic(c),
 167                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 168                            u_isalnum(c),
 169                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 170                                                   u_charType(c),
 171                                                   U_SHORT_PROPERTY_NAME),
 172                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 173                                                   u_getIntPropertyValue(c,
 174                                                           UCHAR_WORD_BREAK),
 175                                                   U_SHORT_PROPERTY_NAME),
 176                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 177                                    u_getIntPropertyValue(c,
 178                                            UCHAR_SENTENCE_BREAK),
 179                                    U_SHORT_PROPERTY_NAME),
 180                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 181                                    u_getIntPropertyValue(c,
 182                                            UCHAR_LINE_BREAK),
 183                                    U_SHORT_PROPERTY_NAME),
 184                            name);
 185     }
 186 }
 187
 188
 189 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 190    UErrorCode status = U_ZERO_ERROR;
 191    UText *tstr = NULL;
 192    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 193    if (U_FAILURE(status)) {
 194        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 195        return;
 196     }
 197    printStringBreaks(tstr, expected, expectedCount);
 198    utext_close(tstr);
 199 }
 200
 201
 202 void RBBITest::TestBug3818() {
 203     UErrorCode  status = U_ZERO_ERROR;
 204
 205     // Four Thai words...
 206     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 207                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 208     UnicodeString  thaiStr(thaiWordData);
 209
 210     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 211     if (U_FAILURE(status) || bi == NULL) {
 212         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 213         return;
 214     }
 215     bi->setText(thaiStr);
 216
 217     int32_t  startOfSecondWord = bi->following(1);
 218     if (startOfSecondWord != 4) {
 219         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 220             __FILE__, __LINE__, startOfSecondWord);
 221     }
 222     startOfSecondWord = bi->following(0);
 223     if (startOfSecondWord != 4) {
 224         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 225             __FILE__, __LINE__, startOfSecondWord);
 226     }
 227     delete bi;
 228 }
 229
 230
 231 //---------------------------------------------
 232 //
 233 //     other tests
 234 //
 235 //---------------------------------------------
 236
 237 void RBBITest::TestGetAvailableLocales()
 238 {
 239     int32_t locCount = 0;
 240     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 241
 242     if (locCount == 0)
 243         dataerrln("getAvailableLocales() returned an empty list!");
 244     // Just make sure that it's returning good memory.
 245     int32_t i;
 246     for (i = 0; i < locCount; ++i) {
 247         logln(locList[i].getName());
 248     }
 249 }
 250
 251 //Testing the BreakIterator::getDisplayName() function
 252 void RBBITest::TestGetDisplayName()
 253 {
 254     UnicodeString   result;
 255
 256     BreakIterator::getDisplayName(Locale::getUS(), result);
 257     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 258         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 259                 + result);
 260
 261     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 262     if (result != "French (France)")
 263         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 264                 + result);
 265 }
 266 /**
 267  * Test End Behaviour
 268  * @bug 4068137
 269  */
 270 void RBBITest::TestEndBehaviour()
 271 {
 272     UErrorCode status = U_ZERO_ERROR;
 273     UnicodeString testString("boo.");
 274     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 275     if (U_FAILURE(status))
 276     {
 277         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 278         return;
 279     }
 280     wb->setText(testString);
 281
 282     if (wb->first() != 0)
 283         errln("Didn't get break at beginning of string.");
 284     if (wb->next() != 3)
 285         errln("Didn't get break before period in \"boo.\"");
 286     if (wb->current() != 4 && wb->next() != 4)
 287         errln("Didn't get break at end of string.");
 288     delete wb;
 289 }
 290 /*
 291  * @bug 4153072
 292  */
 293 void RBBITest::TestBug4153072() {
 294     UErrorCode status = U_ZERO_ERROR;
 295     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 296     if (U_FAILURE(status))
 297     {
 298         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 299         return;
 300     }
 301     UnicodeString str("...Hello, World!...");
 302     int32_t begin = 3;
 303     int32_t end = str.length() - 3;
 304     UBool onBoundary;
 305
 306     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 307     iter->adoptText(textIterator);
 308     int index;
 309     // Note: with the switch to UText, there is no way to restrict the
 310     //       iteration range to begin at an index other than zero.
 311     //       String character iterators created with a non-zero bound are
 312     //         treated by RBBI as being empty.
 313     for (index = -1; index < begin + 1; ++index) {
 314         onBoundary = iter->isBoundary(index);
 315         if (index == 0?  !onBoundary : onBoundary) {
 316             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 317                             " and begin index = " + begin);
 318         }
 319     }
 320     delete iter;
 321 }
 322
 323
 324 //
 325 // Test for problem reported by Ashok Matoria on 9 July 2007
 326 //    One.<kSoftHyphen><kSpace>Two.
 327 //
 328 //    Sentence break at start (0) and then on calling next() it breaks at
 329 //   'T' of "Two". Now, at this point if I do next() and
 330 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 331 //
 332 void RBBITest::TestBug5775() {
 333     UErrorCode status = U_ZERO_ERROR;
 334     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 335     TEST_ASSERT_SUCCESS(status);
 336     if (U_FAILURE(status)) {
 337         return;
 338     }
 339 // Check for status first for better handling of no data errors.
 340     TEST_ASSERT(bi != NULL);
 341     if (bi == NULL) {
 342         return;
 343     }
 344
 345     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 346     //               01234      56789
 347     s = s.unescape();
 348     bi->setText(s);
 349     int pos = bi->next();
 350     TEST_ASSERT(pos == 6);
 351     pos = bi->next();
 352     TEST_ASSERT(pos == 10);
 353     pos = bi->previous();
 354     TEST_ASSERT(pos == 6);
 355     delete bi;
 356 }
 357
 358
 359
 360 //------------------------------------------------------------------------------
 361 //
 362 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 363 //
 364 //------------------------------------------------------------------------------
 365
 366 struct TestParams {
 367     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 368                                            //   Changed out whenever test data changes break type.
 369
 370     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 371     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 372     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 373     UVector32       *srcCol;
 374
 375     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 376     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 377     CharString       utf8String;           // UTF-8 form of text to break.
 378
 379     TestParams(UErrorCode &status) : dataToBreak() {
 380         bi               = NULL;
 381         expectedBreaks   = new UVector32(status);
 382         srcLine          = new UVector32(status);
 383         srcCol           = new UVector32(status);
 384         textToBreak      = NULL;
 385         textMap          = new UVector32(status);
 386     }
 387
 388     ~TestParams() {
 389         delete bi;
 390         delete expectedBreaks;
 391         delete srcLine;
 392         delete srcCol;
 393         utext_close(textToBreak);
 394         delete textMap;
 395     }
 396
 397     int32_t getSrcLine(int32_t bp);
 398     int32_t getExpectedBreak(int32_t bp);
 399     int32_t getSrcCol(int32_t bp);
 400
 401     void setUTF16(UErrorCode &status);
 402     void setUTF8(UErrorCode &status);
 403 };
 404
 405 // Append a UnicodeString to a CharString with UTF-8 encoding.
 406 // Substitute any invalid chars.
 407 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 408 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 409     if (U_FAILURE(status)) {
 410         return;
 411     }
 412     int32_t utf8Length;
 413     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 414                        src.getBuffer(), src.length(),   // UTF-16 data
 415                        0xfffd, NULL,                    // Substitution char, number of subs.
 416                        &status);
 417     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 418         return;
 419     }
 420     status = U_ZERO_ERROR;
 421     int32_t capacity;
 422     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 423     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 424                        src.getBuffer(), src.length(),
 425                        0xfffd, NULL, &status);
 426     dest.append(buffer, utf8Length, status);
 427 }
 428
 429
 430 void TestParams::setUTF16(UErrorCode &status) {
 431     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 432     textMap->removeAllElements();
 433     for (int32_t i=0; i<dataToBreak.length(); i++) {
 434         if (i == dataToBreak.getChar32Start(i)) {
 435             textMap->addElement(i, status);
 436         } else {
 437             textMap->addElement(-1, status);
 438         }
 439     }
 440     textMap->addElement(dataToBreak.length(), status);
 441     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 442 }
 443
 444
 445 void TestParams::setUTF8(UErrorCode &status) {
 446     if (U_FAILURE(status)) {
 447         return;
 448     }
 449     utf8String.clear();
 450     CharStringAppend(utf8String, dataToBreak, status);
 451     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 452     if (U_FAILURE(status)) {
 453         return;
 454     }
 455
 456     textMap->removeAllElements();
 457     int32_t utf16Index = 0;
 458     for (;;) {
 459         textMap->addElement(utf16Index, status);
 460         UChar32 c32 = utext_current32(textToBreak);
 461         if (c32 < 0) {
 462             break;
 463         }
 464         utf16Index += U16_LENGTH(c32);
 465         utext_next32(textToBreak);
 466         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 467             textMap->addElement(-1, status);
 468         }
 469     }
 470     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 471 }
 472
 473
 474 int32_t TestParams::getSrcLine(int32_t bp) {
 475     if (bp >= textMap->size()) {
 476         bp = textMap->size() - 1;
 477     }
 478     int32_t i = 0;
 479     for(; bp >= 0 ; --bp) {
 480         // Move to a character boundary if we are not on one already.
 481         i = textMap->elementAti(bp);
 482         if (i >= 0) {
 483             break;
 484         }
 485     }
 486     return srcLine->elementAti(i);
 487 }
 488
 489
 490 int32_t TestParams::getExpectedBreak(int32_t bp) {
 491     if (bp >= textMap->size()) {
 492         return 0;
 493     }
 494     int32_t i = textMap->elementAti(bp);
 495     int32_t retVal = 0;
 496     if (i >= 0) {
 497         retVal = expectedBreaks->elementAti(i);
 498     }
 499     return retVal;
 500 }
 501
 502
 503 int32_t TestParams::getSrcCol(int32_t bp) {
 504     if (bp >= textMap->size()) {
 505         bp = textMap->size() - 1;
 506     }
 507     int32_t i = 0;
 508     for(; bp >= 0; --bp) {
 509         // Move bp to a character boundary if we are not on one already.
 510         i = textMap->elementAti(bp);
 511         if (i >= 0) {
 512             break;
 513         }
 514     }
 515     return srcCol->elementAti(i);
 516 }
 517
 518
 519 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 520     int32_t    bp;
 521     int32_t    prevBP;
 522     int32_t    i;
 523
 524     TEST_ASSERT_SUCCESS(status);
 525     if (U_FAILURE(status)) {
 526         return;
 527     }
 528
 529     if (t->bi == NULL) {
 530         return;
 531     }
 532
 533     t->bi->setText(t->textToBreak, status);
 534     //
 535     //  Run the iterator forward
 536     //
 537     prevBP = -1;
 538     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 539         if (prevBP ==  bp) {
 540             // Fail for lack of forward progress.
 541             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 542                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 543             break;
 544         }
 545
 546         // Check that there we didn't miss an expected break between the last one
 547         //  and this one.
 548         for (i=prevBP+1; i<bp; i++) {
 549             if (t->getExpectedBreak(i) != 0) {
 550                 int expected[] = {0, i};
 551                 printStringBreaks(t->dataToBreak, expected, 2);
 552                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 553                       i, t->getSrcLine(i), t->getSrcCol(i));
 554             }
 555         }
 556
 557         // Check that the break we did find was expected
 558         if (t->getExpectedBreak(bp) == 0) {
 559             int expected[] = {0, bp};
 560             printStringBreaks(t->textToBreak, expected, 2);
 561             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 562                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 563         } else {
 564             // The break was expected.
 565             //   Check that the {nnn} tag value is correct.
 566             int32_t expectedTagVal = t->getExpectedBreak(bp);
 567             if (expectedTagVal == -1) {
 568                 expectedTagVal = 0;
 569             }
 570             int32_t line = t->getSrcLine(bp);
 571             int32_t rs = t->bi->getRuleStatus();
 572             if (rs != expectedTagVal) {
 573                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
 574                       "          Actual, Expected status = %4d, %4d",
 575                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
 576             }
 577         }
 578
 579         prevBP = bp;
 580     }
 581
 582     // Verify that there were no missed expected breaks after the last one found
 583     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
 584         if (t->getExpectedBreak(i) != 0) {
 585             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 586                       i, t->getSrcLine(i), t->getSrcCol(i));
 587         }
 588     }
 589
 590     //
 591     //  Run the iterator backwards, verify that the same breaks are found.
 592     //
 593     prevBP = static_cast<int32_t>(utext_nativeLength(t->textToBreak) + 2); // start with a phony value for the last break pos seen.
 594     bp = t->bi->last();
 595     while (bp != BreakIterator::DONE) {
 596         if (prevBP ==  bp) {
 597             // Fail for lack of progress.
 598             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
 599                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 600             break;
 601         }
 602
 603         // Check that we didn't miss an expected break between the last one
 604         //  and this one.  (UVector returns zeros for index out of bounds.)
 605         for (i=prevBP-1; i>bp; i--) {
 606             if (t->getExpectedBreak(i) != 0) {
 607                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 608                       i, t->getSrcLine(i), t->getSrcCol(i));
 609             }
 610         }
 611
 612         // Check that the break we did find was expected
 613         if (t->getExpectedBreak(bp) == 0) {
 614             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 615                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
 616         } else {
 617             // The break was expected.
 618             //   Check that the {nnn} tag value is correct.
 619             int32_t expectedTagVal = t->getExpectedBreak(bp);
 620             if (expectedTagVal == -1) {
 621                 expectedTagVal = 0;
 622             }
 623             int line = t->getSrcLine(bp);
 624             int32_t rs = t->bi->getRuleStatus();
 625             if (rs != expectedTagVal) {
 626                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
 627                       "          Actual, Expected status = %4d, %4d",
 628                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
 629             }
 630         }
 631
 632         prevBP = bp;
 633         bp = t->bi->previous();
 634     }
 635
 636     // Verify that there were no missed breaks prior to the last one found
 637     for (i=prevBP-1; i>=0; i--) {
 638         if (t->getExpectedBreak(i) != 0) {
 639             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 640                       i, t->getSrcLine(i), t->getSrcCol(i));
 641         }
 642     }
 643
 644     // Check isBoundary()
 645     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
 646         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
 647         UBool boundaryFound    = t->bi->isBoundary(i);
 648         if (boundaryExpected != boundaryFound) {
 649             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
 650                   "        Expected, Actual= %s, %s",
 651                   i, t->getSrcLine(i), t->getSrcCol(i),
 652                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
 653         }
 654     }
 655
 656     // Check following()
 657     for (i=0; i < static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i++) {
 658         int32_t actualBreak = t->bi->following(i);
 659         int32_t expectedBreak = BreakIterator::DONE;
 660         for (int32_t j=i+1; j <= static_cast<int32_t>(utext_nativeLength(t->textToBreak)); j++) {
 661             if (t->getExpectedBreak(j) != 0) {
 662                 expectedBreak = j;
 663                 break;
 664             }
 665         }
 666         if (expectedBreak != actualBreak) {
 667             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
 668                   "        Expected, Actual= %d, %d",
 669                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
 670         }
 671     }
 672
 673     // Check preceding()
 674     for (i=static_cast<int32_t>(utext_nativeLength(t->textToBreak)); i>=0; i--) {
 675         int32_t actualBreak = t->bi->preceding(i);
 676         int32_t expectedBreak = BreakIterator::DONE;
 677
 678         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
 679         // preceding(trailing byte) will return the index of some preceding code point,
 680         // not the lead byte of the current code point, even though that has a smaller index.
 681         // Therefore, start looking at the expected break data not at i-1, but at
 682         // the start of code point index - 1.
 683         utext_setNativeIndex(t->textToBreak, i);
 684         int32_t j = static_cast<int32_t>(utext_getNativeIndex(t->textToBreak) - 1);
 685         for (; j >= 0; j--) {
 686             if (t->getExpectedBreak(j) != 0) {
 687                 expectedBreak = j;
 688                 break;
 689             }
 690         }
 691         if (expectedBreak != actualBreak) {
 692             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
 693                   "        Expected, Actual= %d, %d",
 694                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
 695         }
 696     }
 697 }
 698
 699
 700 void RBBITest::TestExtended() {
 701   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
 702   // data driven test closely entangles filtered and regular data.
 703 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
 704     UErrorCode      status  = U_ZERO_ERROR;
 705     Locale          locale("");
 706
 707     TestParams          tp(status);
 708
 709     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
 710     if (U_FAILURE(status)) {
 711         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
 712     }
 713
 714     //
 715     //  Open and read the test data file.
 716     //
 717     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 718     CharString testFileName(testDataDirectory, -1, status);
 719     testFileName.append("rbbitst.txt", -1, status);
 720
 721     int    len;
 722     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
 723     if (U_FAILURE(status)) {
 724         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
 725         return;
 726     }
 727
 728     bool skipTest = false; // Skip this test?
 729
 730     //
 731     //  Put the test data into a UnicodeString
 732     //
 733     UnicodeString testString(FALSE, testFile, len);
 734
 735     enum EParseState{
 736         PARSE_COMMENT,
 737         PARSE_TAG,
 738         PARSE_DATA,
 739         PARSE_NUM,
 740         PARSE_RULES
 741     }
 742     parseState = PARSE_TAG;
 743
 744     EParseState savedState = PARSE_TAG;
 745
 746     int32_t    lineNum  = 1;
 747     int32_t    colStart = 0;
 748     int32_t    column   = 0;
 749     int32_t    charIdx  = 0;
 750
 751     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
 752
 753     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
 754     int32_t             rulesFirstLine = 0;  // Line number of the start of current <rules> block
 755
 756     // <rdar://problem/51193810>
 757     mach_timebase_info_data_t info;
 758     uint64_t start, durationOpen = 0.0, durationUse = 0.0;
 759     mach_timebase_info(&info);
 760     UBool isLine = FALSE;
 761
 762     for (charIdx = 0; charIdx < len; ) {
 763         status = U_ZERO_ERROR;
 764         UChar  c = testString.charAt(charIdx);
 765         charIdx++;
 766         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
 767             // treat CRLF as a unit
 768             c = u'\n';
 769             charIdx++;
 770         }
 771         if (c == u'\n' || c == u'\r') {
 772             lineNum++;
 773             colStart = charIdx;
 774         }
 775         column = charIdx - colStart + 1;
 776
 777         switch (parseState) {
 778         case PARSE_COMMENT:
 779             if (c == u'\n' || c == u'\r') {
 780                 parseState = savedState;
 781             }
 782             break;
 783
 784         case PARSE_TAG:
 785             {
 786             if (c == u'#') {
 787                 parseState = PARSE_COMMENT;
 788                 savedState = PARSE_TAG;
 789                 break;
 790             }
 791             if (u_isUWhiteSpace(c)) {
 792                 break;
 793             }
 794             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
 795                 delete tp.bi;
 796                 tp.bi = BreakIterator::createWordInstance(locale,  status);
 797                 skipTest = false;
 798                 charIdx += 5;
 799                 isLine = FALSE;
 800                 break;
 801             }
 802             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
 803                 delete tp.bi;
 804                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
 805                 skipTest = false;
 806                 charIdx += 5;
 807                 isLine = FALSE;
 808                 break;
 809             }
 810             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
 811                 delete tp.bi;
 812                 start = mach_absolute_time(); // <rdar://problem/51193810>
 813                 tp.bi = BreakIterator::createLineInstance(locale,  status);
 814                 durationOpen += (((mach_absolute_time() - start) * info.numer)/info.denom);
 815                 skipTest = false;
 816                 charIdx += 5;
 817                 isLine = TRUE;
 818                 break;
 819             }
 820             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
 821                 delete tp.bi;
 822                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
 823                 skipTest = false;
 824                 charIdx += 5;
 825                 break;
 826             }
 827             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
 828                 delete tp.bi;
 829                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
 830                 charIdx += 6;
 831                 isLine = FALSE;
 832                 break;
 833             }
 834
 835             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
 836                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
 837                 charIdx = testString.indexOf(u'>', charIdx) + 1;
 838                 parseState = PARSE_RULES;
 839                 rules.remove();
 840                 rulesFirstLine = lineNum;
 841                 isLine = FALSE;
 842                 break;
 843             }
 844
 845             // <locale  loc_name>
 846             localeMatcher.reset(testString);
 847             if (localeMatcher.lookingAt(charIdx-1, status)) {
 848                 UnicodeString localeName = localeMatcher.group(1, status);
 849                 char localeName8[100];
 850                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
 851                 locale = Locale::createFromName(localeName8);
 852                 charIdx += localeMatcher.group(0, status).length() - 1;
 853                 TEST_ASSERT_SUCCESS(status);
 854                 break;
 855             }
 856             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
 857                 parseState = PARSE_DATA;
 858                 charIdx += 5;
 859                 tp.dataToBreak = "";
 860                 tp.expectedBreaks->removeAllElements();
 861                 tp.srcCol ->removeAllElements();
 862                 tp.srcLine->removeAllElements();
 863                 break;
 864             }
 865
 866             errln("line %d: Tag expected in test file.", lineNum);
 867             parseState = PARSE_COMMENT;
 868             savedState = PARSE_DATA;
 869             goto end_test; // Stop the test.
 870             }
 871             break;
 872
 873         case PARSE_RULES:
 874             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
 875                 charIdx += 7;
 876                 parseState = PARSE_TAG;
 877                 delete tp.bi;
 878                 UParseError pe;
 879                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
 880                 skipTest = U_FAILURE(status);
 881                 if (U_FAILURE(status)) {
 882                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
 883                         rulesFirstLine + pe.line - 1, u_errorName(status));
 884                 }
 885             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
 886                 charIdx += 10;
 887                 parseState = PARSE_TAG;
 888                 UErrorCode ec = U_ZERO_ERROR;
 889                 UParseError pe;
 890                 RuleBasedBreakIterator bi(rules, pe, ec);
 891                 if (U_SUCCESS(ec)) {
 892                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
 893                         rulesFirstLine + pe.line - 1);
 894                 }
 895             } else {
 896                 rules.append(c);
 897             }
 898             break;
 899
 900         case PARSE_DATA:
 901             if (c == u'\u2022') { // u'•'
 902                 int32_t  breakIdx = tp.dataToBreak.length();
 903                 tp.expectedBreaks->setSize(breakIdx+1);
 904                 tp.expectedBreaks->setElementAt(-1, breakIdx);
 905                 tp.srcLine->setSize(breakIdx+1);
 906                 tp.srcLine->setElementAt(lineNum, breakIdx);
 907                 tp.srcCol ->setSize(breakIdx+1);
 908                 tp.srcCol ->setElementAt(column, breakIdx);
 909                 break;
 910             }
 911
 912             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
 913                 // Add final entry to mappings from break location to source file position.
 914                 //  Need one extra because last break position returned is after the
 915                 //    last char in the data, not at the last char.
 916                 tp.srcLine->addElement(lineNum, status);
 917                 tp.srcCol ->addElement(column, status);
 918
 919                 parseState = PARSE_TAG;
 920                 charIdx += 6;
 921
 922                 if (!skipTest) {
 923                     // RUN THE TEST!
 924                     status = U_ZERO_ERROR;
 925                     tp.setUTF16(status);
 926                     start = mach_absolute_time(); // <rdar://problem/51193810>
 927                     executeTest(&tp, status);
 928                     if (isLine) {
 929                         durationUse += (((mach_absolute_time() - start) * info.numer)/info.denom);
 930                     }
 931                     TEST_ASSERT_SUCCESS(status);
 932
 933                     // Run again, this time with UTF-8 text wrapped in a UText.
 934                     status = U_ZERO_ERROR;
 935                     tp.setUTF8(status);
 936                     TEST_ASSERT_SUCCESS(status);
 937                     executeTest(&tp, status);
 938                 }
 939                 break;
 940             }
 941
 942             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
 943                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
 944                 // Get the code point from the name and insert it into the test data.
 945                 //   (Damn, no API takes names in Unicode  !!!
 946                 //    we've got to take it back to char *)
 947                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
 948                 int32_t nameLength = nameEndIdx - (charIdx+2);
 949                 char charNameBuf[200];
 950                 UChar32 theChar = -1;
 951                 if (nameEndIdx != -1) {
 952                     UErrorCode status = U_ZERO_ERROR;
 953                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
 954                     charNameBuf[sizeof(charNameBuf)-1] = 0;
 955                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
 956                     if (U_FAILURE(status)) {
 957                         theChar = -1;
 958                     }
 959                 }
 960                 if (theChar == -1) {
 961                     errln("Error in named character in test file at line %d, col %d",
 962                         lineNum, column);
 963                 } else {
 964                     // Named code point was recognized.  Insert it
 965                     //   into the test data.
 966                     tp.dataToBreak.append(theChar);
 967                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
 968                         tp.srcLine->addElement(lineNum, status);
 969                         tp.srcCol ->addElement(column, status);
 970                     }
 971                 }
 972                 if (nameEndIdx > charIdx) {
 973                     charIdx = nameEndIdx+1;
 974
 975                 }
 976                 break;
 977             }
 978
 979
 980
 981             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
 982                 charIdx++;
 983                 int32_t  breakIdx = tp.dataToBreak.length();
 984                 tp.expectedBreaks->setSize(breakIdx+1);
 985                 tp.expectedBreaks->setElementAt(-1, breakIdx);
 986                 tp.srcLine->setSize(breakIdx+1);
 987                 tp.srcLine->setElementAt(lineNum, breakIdx);
 988                 tp.srcCol ->setSize(breakIdx+1);
 989                 tp.srcCol ->setElementAt(column, breakIdx);
 990                 break;
 991             }
 992
 993             if (c == u'<') {
 994                 tagValue   = 0;
 995                 parseState = PARSE_NUM;
 996                 break;
 997             }
 998
 999             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1000                 parseState = PARSE_COMMENT;
1001                 savedState = PARSE_DATA;
1002                 break;
1003             }
1004
1005             if (c == u'\\') {
1006                 // Check for \ at end of line, a line continuation.
1007                 //     Advance over (discard) the newline
1008                 UChar32 cp = testString.char32At(charIdx);
1009                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1010                     // We have a CR LF
1011                     //  Need an extra increment of the input ptr to move over both of them
1012                     charIdx++;
1013                 }
1014                 if (cp == u'\n' || cp == u'\r') {
1015                     lineNum++;
1016                     colStart = charIdx;
1017                     charIdx++;
1018                     break;
1019                 }
1020
1021                 // Let unescape handle the back slash.
1022                 cp = testString.unescapeAt(charIdx);
1023                 if (cp != -1) {
1024                     // Escape sequence was recognized.  Insert the char
1025                     //   into the test data.
1026                     tp.dataToBreak.append(cp);
1027                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1028                         tp.srcLine->addElement(lineNum, status);
1029                         tp.srcCol ->addElement(column, status);
1030                     }
1031                     break;
1032                 }
1033
1034
1035                 // Not a recognized backslash escape sequence.
1036                 // Take the next char as a literal.
1037                 //  TODO:  Should this be an error?
1038                 c = testString.charAt(charIdx);
1039                 charIdx = testString.moveIndex32(charIdx, 1);
1040             }
1041
1042             // Normal, non-escaped data char.
1043             tp.dataToBreak.append(c);
1044
1045             // Save the mapping from offset in the data to line/column numbers in
1046             //   the original input file.  Will be used for better error messages only.
1047             //   If there's an expected break before this char, the slot in the mapping
1048             //     vector will already be set for this char; don't overwrite it.
1049             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1050                 tp.srcLine->addElement(lineNum, status);
1051                 tp.srcCol ->addElement(column, status);
1052             }
1053             break;
1054
1055
1056         case PARSE_NUM:
1057             // We are parsing an expected numeric tag value, like <1234>,
1058             //   within a chunk of data.
1059             if (u_isUWhiteSpace(c)) {
1060                 break;
1061             }
1062
1063             if (c == u'>') {
1064                 // Finished the number.  Add the info to the expected break data,
1065                 //   and switch parse state back to doing plain data.
1066                 parseState = PARSE_DATA;
1067                 if (tagValue == 0) {
1068                     tagValue = -1;
1069                 }
1070                 int32_t  breakIdx = tp.dataToBreak.length();
1071                 tp.expectedBreaks->setSize(breakIdx+1);
1072                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1073                 tp.srcLine->setSize(breakIdx+1);
1074                 tp.srcLine->setElementAt(lineNum, breakIdx);
1075                 tp.srcCol ->setSize(breakIdx+1);
1076                 tp.srcCol ->setElementAt(column, breakIdx);
1077                 break;
1078             }
1079
1080             if (u_isdigit(c)) {
1081                 tagValue = tagValue*10 + u_charDigitValue(c);
1082                 break;
1083             }
1084
1085             errln("Syntax Error in test file at line %d, col %d",
1086                 lineNum, column);
1087             parseState = PARSE_COMMENT;
1088             goto end_test; // Stop the test
1089             break;
1090         }
1091
1092
1093         if (U_FAILURE(status)) {
1094             dataerrln("ICU Error %s while parsing test file at line %d.",
1095                 u_errorName(status), lineNum);
1096             status = U_ZERO_ERROR;
1097             goto end_test; // Stop the test
1098         }
1099
1100     }
1101
1102     // Reached end of test file. Raise an error if parseState indicates that we are
1103     //   within a block that should have been terminated.
1104
1105     if (parseState == PARSE_RULES) {
1106         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1107             lineNum, rulesFirstLine);
1108     }
1109     if (parseState == PARSE_DATA) {
1110         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1111     }
1112
1113     //
1114     infoln("TestExtended total time in createLineInstance     (nsec):\t%llu\n", durationOpen);
1115     infoln("TestExtended total time in linebreak test execute (nsec):\t%llu\n", durationUse);
1116
1117
1118 end_test:
1119     delete [] testFile;
1120 #endif
1121 }
1122
1123
1124 //-------------------------------------------------------------------------------
1125 //
1126 //  TestDictRules   create a break iterator from source rules that includes a
1127 //                  dictionary range.   Regression for bug #7130.  Source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
1129 //                  but the dictionary code, without a type, would loop.
1130 //
1131 //-------------------------------------------------------------------------------
1132 void RBBITest::TestDictRules() {
1133     const char *rules =  "$dictionary = [a-z]; \n"
1134                          "!!forward; \n"
1135                          "$dictionary $dictionary; \n"
1136                          "!!reverse; \n"
1137                          "$dictionary $dictionary; \n";
1138     const char *text = "aa";
1139     UErrorCode status = U_ZERO_ERROR;
1140     UParseError parseError;
1141
1142     RuleBasedBreakIterator bi(rules, parseError, status);
1143     if (U_SUCCESS(status)) {
1144         UnicodeString utext = text;
1145         bi.setText(utext);
1146         int32_t position;
1147         int32_t loops;
1148         for (loops = 0; loops<10; loops++) {
1149             position = bi.next();
1150             if (position == RuleBasedBreakIterator::DONE) {
1151                 break;
1152             }
1153         }
1154         TEST_ASSERT(loops == 1);
1155     } else {
1156         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1157     }
1158 }
1159
1160
1161
1162 //-------------------------------------------------------------------------------
1163 //
1164 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1165 //    return the data in one big UChar * buffer, which the caller must delete.
1166 //
1167 //    parameters:
1168 //          fileName:   the name of the file, with no directory part.  The test data directory
1169 //                      is assumed.
1170 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1171 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1172 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1173 //                      Pass NULL for the system default encoding.
1174 //          status
1175 //    returns:
1176 //                      The file data, converted to UChar.
1177 //                      The caller must delete this when done with
1178 //                           delete [] theBuffer;
1179 //
1180 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1181 //           Move this function to some common place.
1182 //
1183 //--------------------------------------------------------------------------------
1184 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1185     UChar       *retPtr  = NULL;
1186     char        *fileBuf = NULL;
1187     UConverter* conv     = NULL;
1188     FILE        *f       = NULL;
1189
1190     ulen = 0;
1191     if (U_FAILURE(status)) {
1192         return retPtr;
1193     }
1194
1195     //
1196     //  Open the file.
1197     //
1198     f = fopen(fileName, "rb");
1199     if (f == 0) {
1200         dataerrln("Error opening test data file %s\n", fileName);
1201         status = U_FILE_ACCESS_ERROR;
1202         return NULL;
1203     }
1204     //
1205     //  Read it in
1206     //
1207     int   fileSize;
1208     int   amt_read;
1209
1210     fseek( f, 0, SEEK_END);
1211     fileSize = ftell(f);
1212     fileBuf = new char[fileSize];
1213     fseek(f, 0, SEEK_SET);
1214     amt_read = static_cast<int>(fread(fileBuf, 1, fileSize, f));
1215     if (amt_read != fileSize || fileSize <= 0) {
1216         errln("Error reading test data file.");
1217         goto cleanUpAndReturn;
1218     }
1219
1220     //
1221     // Look for a Unicode Signature (BOM) on the data just read
1222     //
1223     int32_t        signatureLength;
1224     const char *   fileBufC;
1225     const char*    bomEncoding;
1226
1227     fileBufC = fileBuf;
1228     bomEncoding = ucnv_detectUnicodeSignature(
1229         fileBuf, fileSize, &signatureLength, &status);
1230     if(bomEncoding!=NULL ){
1231         fileBufC  += signatureLength;
1232         fileSize  -= signatureLength;
1233         encoding = bomEncoding;
1234     }
1235
1236     //
1237     // Open a converter to take the rule file to UTF-16
1238     //
1239     conv = ucnv_open(encoding, &status);
1240     if (U_FAILURE(status)) {
1241         goto cleanUpAndReturn;
1242     }
1243
1244     //
1245     // Convert the rules to UChar.
1246     //  Preflight first to determine required buffer size.
1247     //
1248     ulen = ucnv_toUChars(conv,
1249         NULL,           //  dest,
1250         0,              //  destCapacity,
1251         fileBufC,
1252         fileSize,
1253         &status);
1254     if (status == U_BUFFER_OVERFLOW_ERROR) {
1255         // Buffer Overflow is expected from the preflight operation.
1256         status = U_ZERO_ERROR;
1257
1258         retPtr = new UChar[ulen+1];
1259         ucnv_toUChars(conv,
1260             retPtr,       //  dest,
1261             ulen+1,
1262             fileBufC,
1263             fileSize,
1264             &status);
1265     }
1266
1267 cleanUpAndReturn:
1268     fclose(f);
1269     delete []fileBuf;
1270     ucnv_close(conv);
1271     if (U_FAILURE(status)) {
1272         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1273         delete []retPtr;
1274         retPtr = 0;
1275         ulen   = 0;
1276     }
1277     return retPtr;
1278 }
1279
1280
1281
1282 //--------------------------------------------------------------------------------------------
1283 //
1284 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1285 //
1286 //-------------------------------------------------------------------------------------------
1287 void RBBITest::TestUnicodeFiles() {
1288     RuleBasedBreakIterator  *bi;
1289     UErrorCode               status = U_ZERO_ERROR;
1290
1291     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1292     TEST_ASSERT_SUCCESS(status);
1293     if (U_SUCCESS(status)) {
1294         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1295     }
1296     delete bi;
1297
1298     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1299     TEST_ASSERT_SUCCESS(status);
1300     if (U_SUCCESS(status)) {
1301         runUnicodeTestData("WordBreakTest.txt", bi);
1302     }
1303     delete bi;
1304
1305     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1306     TEST_ASSERT_SUCCESS(status);
1307     if (U_SUCCESS(status)) {
1308         runUnicodeTestData("SentenceBreakTest.txt", bi);
1309     }
1310     delete bi;
1311
1312     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1313     TEST_ASSERT_SUCCESS(status);
1314     if (U_SUCCESS(status)) {
1315         runUnicodeTestData("LineBreakTest.txt", bi);
1316     }
1317     delete bi;
1318 }
1319
1320
1321 // Check for test cases from the Unicode test data files that are known to fail
1322 // and should be skipped as known issues because ICU does not fully implement
1323 // the Unicode specifications, or because ICU includes tailorings that differ from
1324 // the Unicode standard.
1325 //
1326 // Test cases are identified by the test data sequence, which tends to be more stable
1327 // across Unicode versions than the test file line numbers.
1328 //
1329 // The test case with ticket "10666" is a dummy, included as an example.
1330
1331 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1332     static struct TestCase {
1333         const char *fTicketNum;
1334         const char *fFileName;
1335         const UChar *fString;
1336     } badTestCases[] = {
1337         {"10666", "GraphemeBreakTest.txt", u"\u0020\u0020\u0033"},    // Fake example, for illustration.
1338         // Issue 8151, move the Finnish tailoring of the line break of hyphens to root.
1339         // This probably ultimately wants to be resolved by updating UAX-14, but in the mean time
1340         // ICU is out of sync with Unicode.
1341         {"8151",  "LineBreakTest.txt", u"-#"},
1342         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0023"},
1343         {"8151",  "LineBreakTest.txt", u"\u002d\u00a7"},
1344         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u00a7"},
1345         {"8151",  "LineBreakTest.txt", u"\u002d\U00050005"},
1346         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\U00050005"},
1347         {"8151",  "LineBreakTest.txt", u"\u002d\u0e01"},
1348         {"8151",  "LineBreakTest.txt", u"\u002d\u0308\u0e01"},
1349
1350         // Issue ICU-12017 Improve line break around numbers
1351         {"12017", "LineBreakTest.txt", u"\u002C\u0030"},   // ",0"
1352         {"12017", "LineBreakTest.txt", u"\u002C\u0308\u0030"},
1353         {"12017", "LineBreakTest.txt", u"find .com"},
1354         {"12017", "LineBreakTest.txt", u"equals .35 cents"},
1355         {"12017", "LineBreakTest.txt", u"a.2 "},
1356         {"12017", "LineBreakTest.txt", u"a.2 \u0915"},
1357         {"12017", "LineBreakTest.txt", u"a.2 \u672C"},
1358         {"12017", "LineBreakTest.txt", u"a.2\u3000\u672C"},
1359         {"12017", "LineBreakTest.txt", u"a.2\u3000\u307E"},
1360         {"12017", "LineBreakTest.txt", u"a.2\u3000\u0033"},
1361         {"12017", "LineBreakTest.txt", u"A.1 \uBABB"},
1362         {"12017", "LineBreakTest.txt", u"\uBD24\uC5B4\u002E\u0020\u0041\u002E\u0032\u0020\uBCFC"},
1363         {"12017", "LineBreakTest.txt", u"\uBD10\uC694\u002E\u0020\u0041\u002E\u0033\u0020\uBABB"},
1364         {"12017", "LineBreakTest.txt", u"\uC694\u002E\u0020\u0041\u002E\u0034\u0020\uBABB"},
1365         {"12017", "LineBreakTest.txt", u"a.2\u3000\u300C"},
1366     };
1367
1368     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1369         const TestCase &badCase = badTestCases[n];
1370         if (!strcmp(fileName, badCase.fFileName) &&
1371                 testCase == UnicodeString(badCase.fString)) {
1372             return logKnownIssue(badCase.fTicketNum);
1373         }
1374     }
1375     return FALSE;
1376 }
1377
1378
1379 //--------------------------------------------------------------------------------------------
1380 //
1381 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1382 //
1383 //-------------------------------------------------------------------------------------------
1384 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1386     UErrorCode  status = U_ZERO_ERROR;
1387
1388     //
1389     //  Open and read the test data file, put it into a UnicodeString.
1390     //
1391     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1392     char testFileName[1000];
1393     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1394         dataerrln("Can't open test data.  Path too long.");
1395         return;
1396     }
1397     strcpy(testFileName, testDataDirectory);
1398     strcat(testFileName, fileName);
1399
1400     logln("Opening data file %s\n", fileName);
1401
1402     int    len;
1403     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1404     if (status != U_FILE_ACCESS_ERROR) {
1405         TEST_ASSERT_SUCCESS(status);
1406         TEST_ASSERT(testFile != NULL);
1407     }
1408     if (U_FAILURE(status) || testFile == NULL) {
1409         return; /* something went wrong, error already output */
1410     }
1411     UnicodeString testFileAsString(TRUE, testFile, len);
1412
1413     //
1414     //  Parse the test data file using a regular expression.
1415     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1416     //     is identified by which group had a match.
1417     //
1418     //    Caputure Group #                  1          2            3            4           5
1419     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1420     //
1421     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1422     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1423     UnicodeString   testString;
1424     UVector32       breakPositions(status);
1425     int             lineNumber = 1;
1426     TEST_ASSERT_SUCCESS(status);
1427     if (U_FAILURE(status)) {
1428         return;
1429     }
1430
1431     //
1432     //  Scan through each test case, building up the string to be broken in testString,
1433     //   and the positions that should be boundaries in the breakPositions vector.
1434     //
1435     int spin = 0;
1436     while (tokenMatcher.find()) {
1437         if(tokenMatcher.hitEnd()) {
1438           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1439              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1440              and caused an infinite loop here on EBCDIC systems!
1441           */
1442           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1443           //       return;
1444         }
1445         if (tokenMatcher.start(1, status) >= 0) {
1446             // Scanned a divide sign, indicating a break position in the test data.
1447             if (testString.length()>0) {
1448                 breakPositions.addElement(testString.length(), status);
1449             }
1450         }
1451         else if (tokenMatcher.start(2, status) >= 0) {
1452             // Scanned an 'x', meaning no break at this position in the test data
1453             //   Nothing to be done here.
1454             }
1455         else if (tokenMatcher.start(3, status) >= 0) {
1456             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1457             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1458             int length = hexNumber.length();
1459             if (length<=8) {
1460                 char buf[10];
1461                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1462                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1463                 if (c<=0x10ffff) {
1464                     testString.append(c);
1465                 } else {
1466                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1467                        fileName, lineNumber);
1468                 }
1469             } else {
1470                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1471                        fileName, lineNumber);
1472              }
1473         }
1474         else if (tokenMatcher.start(4, status) >= 0) {
1475             // Scanned to end of a line, possibly skipping over a comment in the process.
1476             //   If the line from the file contained test data, run the test now.
1477             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1478                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1479             }
1480
1481             // Clear out this test case.
1482             //    The string and breakPositions vector will be refilled as the next
1483             //       test case is parsed.
1484             testString.remove();
1485             breakPositions.removeAllElements();
1486             lineNumber++;
1487         } else {
1488             // Scanner catchall.  Something unrecognized appeared on the line.
1489             char token[16];
1490             UnicodeString uToken = tokenMatcher.group(0, status);
1491             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1492             token[sizeof(token)-1] = 0;
1493             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1494
1495             // Clean up, in preparation for continuing with the next line.
1496             testString.remove();
1497             breakPositions.removeAllElements();
1498             lineNumber++;
1499         }
1500         TEST_ASSERT_SUCCESS(status);
1501         if (U_FAILURE(status)) {
1502             break;
1503         }
1504     }
1505
1506     delete [] testFile;
1507  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1508 }
1509
1510 //--------------------------------------------------------------------------------------------
1511 //
1512 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1513 //                            test data files.  Do only a simple, forward-only check -
1514 //                            this test is mostly to check that ICU and the Unicode
1515 //                            data agree with each other.
1516 //
1517 //--------------------------------------------------------------------------------------------
1518 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1519                          const UnicodeString &testString,   // Text data to be broken
1520                          UVector32 *breakPositions,         // Positions where breaks should be found.
1521                          RuleBasedBreakIterator *bi) {
1522     int32_t pos;                 // Break Position in the test string
1523     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1524     int32_t expectedPos;         // Expected break position (index into test string)
1525
1526     bi->setText(testString);
1527     pos = bi->first();
1528     pos = bi->next();
1529
1530     while (pos != BreakIterator::DONE) {
1531         if (expectedI >= breakPositions->size()) {
1532             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1533                 testFileName, lineNumber, pos);
1534             break;
1535         }
1536         expectedPos = breakPositions->elementAti(expectedI);
1537         if (pos < expectedPos) {
1538             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1539                 testFileName, lineNumber, pos);
1540             break;
1541         }
1542         if (pos > expectedPos) {
1543             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1544                 testFileName, lineNumber, expectedPos);
1545             break;
1546         }
1547         pos = bi->next();
1548         expectedI++;
1549     }
1550
1551     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1552         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1553             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1554     }
1555 }
1556
1557
1558
1559 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1560 //---------------------------------------------------------------------------------------
1561 //
//   class RBBIMonkeyKind
1563 //
1564 //      Monkey Test for Break Iteration
1565 //      Abstract interface class.   Concrete derived classes independently
1566 //      implement the break rules for different iterator types.
1567 //
//      The Monkey Test itself doesn't know which type of break iterator it is
1569 //      testing, but works purely in terms of the interface defined here.
1570 //
1571 //---------------------------------------------------------------------------------------
1572 class RBBIMonkeyKind {
1573 public:
1574     // Return a UVector of UnicodeSets, representing the character classes used
1575     //   for this type of iterator.
1576     virtual  UVector  *charClasses() = 0;
1577
1578     // Set the test text on which subsequent calls to next() will operate
1579     virtual  void      setText(const UnicodeString &s) = 0;
1580
1581     // Find the next break postion, starting from the prev break position, or from zero.
1582     // Return -1 after reaching end of string.
1583     virtual  int32_t   next(int32_t i) = 0;
1584
1585     // Name of each character class, parallel with charClasses. Used for debugging output
1586     // of characters.
1587     virtual  std::vector<std::string>&     characterClassNames();
1588
1589     void setAppliedRule(int32_t position, const char* value);
1590
1591     std::string getAppliedRule(int32_t position);
1592
1593     virtual ~RBBIMonkeyKind();
1594     UErrorCode deferredStatus;
1595
1596     std::string classNameFromCodepoint(const UChar32 c);
1597     unsigned int maxClassNameSize();
1598
1599  protected:
1600      RBBIMonkeyKind();
1601      std::vector<std::string> classNames;
1602      std::vector<std::string> appliedRules;
1603
1604     // Clear `appliedRules` and fill it with empty strings in the size of test text.
1605     void prepareAppliedRules(int32_t size );
1606
1607  private:
1608
1609 };
1610
1611 RBBIMonkeyKind::RBBIMonkeyKind() {
1612     deferredStatus = U_ZERO_ERROR;
1613 }
1614
1615 RBBIMonkeyKind::~RBBIMonkeyKind() {
1616 }
1617
1618 std::vector<std::string>& RBBIMonkeyKind::characterClassNames() {
1619     return classNames;
1620 }
1621
1622 void RBBIMonkeyKind::prepareAppliedRules(int32_t size) {
1623     // Remove all the information in the `appliedRules`.
1624     appliedRules.clear();
1625     appliedRules.resize(size + 1);
1626 }
1627
1628 void RBBIMonkeyKind::setAppliedRule(int32_t position, const char* value) {
1629     appliedRules[position] = value;
1630 }
1631
1632 std::string RBBIMonkeyKind::getAppliedRule(int32_t position){
1633     return appliedRules[position];
1634 }
1635
1636 std::string RBBIMonkeyKind::classNameFromCodepoint(const UChar32 c) {
1637     // Simply iterate through charClasses to find character's class
1638     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1639         UnicodeSet *classSet = (UnicodeSet *)charClasses()->elementAt(aClassNum);
1640         if (classSet->contains(c)) {
1641             return classNames[aClassNum];
1642         }
1643     }
1644     U_ASSERT(FALSE);  // This should not happen.
1645     return "bad class name";
1646 }
1647
1648 unsigned int RBBIMonkeyKind::maxClassNameSize() {
1649     unsigned int maxSize = 0;
1650     for (int aClassNum = 0; aClassNum < charClasses()->size(); aClassNum++) {
1651         if (classNames[aClassNum].size() > maxSize) {
1652             maxSize = classNames[aClassNum].size();
1653         }
1654     }
1655     return maxSize;
1656 }
1657
1658 //----------------------------------------------------------------------------------------
1659 //
1660 //   Random Numbers.  Similar to standard lib rand() and srand()
1661 //                    Not using library to
1662 //                      1.  Get same results on all platforms.
1663 //                      2.  Get access to current seed, to more easily reproduce failures.
1664 //
1665 //---------------------------------------------------------------------------------------
// Current generator state. Assign to it to restart a sequence from a known seed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step (arithmetic wraps modulo 2^32)
// and return bits 16..30 of the new state, i.e. a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245u;
    const uint32_t kIncrement  = 12345u;

    m_seed = m_seed * kMultiplier + kIncrement;
    return (m_seed >> 16) & 0x7FFFu;
}
1673
1674
1675 //------------------------------------------------------------------------------------------
1676 //
1677 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1678 //                             of RBBIMonkeyKind.
1679 //
1680 //------------------------------------------------------------------------------------------
1681 class RBBICharMonkey: public RBBIMonkeyKind {
1682 public:
1683     RBBICharMonkey();
1684     virtual          ~RBBICharMonkey();
1685     virtual  UVector *charClasses();
1686     virtual  void     setText(const UnicodeString &s);
1687     virtual  int32_t  next(int32_t i);
1688 private:
1689     UVector   *fSets;
1690
1691     UnicodeSet  *fCRLFSet;
1692     UnicodeSet  *fControlSet;
1693     UnicodeSet  *fExtendSet;
1694     UnicodeSet  *fZWJSet;
1695     UnicodeSet  *fRegionalIndicatorSet;
1696     UnicodeSet  *fPrependSet;
1697     UnicodeSet  *fSpacingSet;
1698     UnicodeSet  *fLSet;
1699     UnicodeSet  *fVSet;
1700     UnicodeSet  *fTSet;
1701     UnicodeSet  *fLVSet;
1702     UnicodeSet  *fLVTSet;
1703     UnicodeSet  *fHangulSet;
1704     UnicodeSet  *fExtendedPictSet;
1705     UnicodeSet  *fViramaSet;
1706     UnicodeSet  *fLinkingConsonantSet;
1707     UnicodeSet  *fExtCccZwjSet;
1708     UnicodeSet  *fAnySet;
1709
1710     const UnicodeString *fText;
1711 };
1712
1713
1714 RBBICharMonkey::RBBICharMonkey() {
1715     UErrorCode  status = U_ZERO_ERROR;
1716
1717     fText = NULL;
1718
1719     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1720     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1721     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1722     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1723     fRegionalIndicatorSet =
1724                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1725     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1726     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1727     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1728     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1729     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1730     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1731     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1732     fHangulSet  = new UnicodeSet();
1733     fHangulSet->addAll(*fLSet);
1734     fHangulSet->addAll(*fVSet);
1735     fHangulSet->addAll(*fTSet);
1736     fHangulSet->addAll(*fLVSet);
1737     fHangulSet->addAll(*fLVTSet);
1738
1739     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1740     fViramaSet        = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1741                                         "\\p{Indic_Syllabic_Category=Virama}]", status);
1742     fLinkingConsonantSet = new UnicodeSet(u"[\\p{Gujr}\\p{sc=Telu}\\p{sc=Mlym}\\p{sc=Orya}\\p{sc=Beng}\\p{sc=Deva}&"
1743                                         "\\p{Indic_Syllabic_Category=Consonant}]", status);
1744     fExtCccZwjSet     = new UnicodeSet(u"[[\\p{gcb=Extend}-\\p{ccc=0}] \\p{gcb=ZWJ}]", status);
1745     fAnySet           = new UnicodeSet(0, 0x10ffff);
1746
1747     // Create sets of characters, and add the names of the above character sets.
1748     // In each new ICU release, add new names corresponding to the sets above.
1749     fSets             = new UVector(status);
1750
1751     // Important: Keep class names the same as the class contents.
1752     fSets->addElement(fCRLFSet, status); classNames.push_back("CRLF");
1753     fSets->addElement(fControlSet, status); classNames.push_back("Control");
1754     fSets->addElement(fExtendSet, status); classNames.push_back("Extended");
1755     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
1756     if (!fPrependSet->isEmpty()) {
1757         fSets->addElement(fPrependSet, status); classNames.push_back("Prepend");
1758     }
1759     fSets->addElement(fSpacingSet, status); classNames.push_back("Spacing");
1760     fSets->addElement(fHangulSet, status); classNames.push_back("Hangul");
1761     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
1762     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
1763     fSets->addElement(fViramaSet, status); classNames.push_back("Virama");
1764     fSets->addElement(fLinkingConsonantSet, status); classNames.push_back("LinkingConsonant");
1765     fSets->addElement(fExtCccZwjSet, status); classNames.push_back("ExtCcccZwj");
1766     fSets->addElement(fAnySet, status); classNames.push_back("Any");
1767
1768     if (U_FAILURE(status)) {
1769         deferredStatus = status;
1770     }
1771 }
1772
1773
1774 void RBBICharMonkey::setText(const UnicodeString &s) {
1775     fText = &s;
1776     prepareAppliedRules(s.length());
1777 }
1778
1779
1780
1781 int32_t RBBICharMonkey::next(int32_t prevPos) {
1782     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1783                               //   break position being tested.  The candidate break
1784                               //   location is before p2.
1785
1786     int     breakPos = -1;
1787
1788     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1789     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1790
1791     if (U_FAILURE(deferredStatus)) {
1792         return -1;
1793     }
1794
1795     // Previous break at end of string.  return DONE.
1796     if (prevPos >= fText->length()) {
1797         return -1;
1798     }
1799
1800     p0 = p1 = p2 = p3 = prevPos;
1801     c3 =  fText->char32At(prevPos);
1802     c0 = c1 = c2 = cBase = 0;
1803     (void)p0;   // suppress set but not used warning.
1804     (void)c0;
1805
1806     // Loop runs once per "significant" character position in the input text.
1807     for (;;) {
1808         // Move all of the positions forward in the input string.
1809         p0 = p1;  c0 = c1;
1810         p1 = p2;  c1 = c2;
1811         p2 = p3;  c2 = c3;
1812
1813         // Advance p3 by one codepoint
1814         p3 = fText->moveIndex32(p3, 1);
1815         c3 = fText->char32At(p3);
1816
1817         if (p1 == p2) {
1818             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1819             continue;
1820         }
1821
1822         if (p2 == fText->length()) {
1823             setAppliedRule(p2, "End of String");
1824             break;
1825         }
1826
1827         //     No Extend or Format characters may appear between the CR and LF,
1828         //     which requires the additional check for p2 immediately following p1.
1829         //
1830         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1831           setAppliedRule(p2, "GB3   CR x LF");
1832           continue;
1833         }
1834
1835         if (fControlSet->contains(c1) ||
1836             c1 == 0x0D ||
1837             c1 == 0x0A)  {
1838           setAppliedRule(p2, "GB4   ( Control | CR | LF ) <break>");
1839           break;
1840         }
1841
1842         if (fControlSet->contains(c2) ||
1843             c2 == 0x0D ||
1844             c2 == 0x0A)  {
1845             setAppliedRule(p2, "GB5   <break>  ( Control | CR | LF )");
1846             break;
1847         }
1848
1849         if (fLSet->contains(c1) &&
1850                (fLSet->contains(c2)  ||
1851                 fVSet->contains(c2)  ||
1852                 fLVSet->contains(c2) ||
1853                 fLVTSet->contains(c2))) {
1854             setAppliedRule(p2, "GB6   L x ( L | V | LV | LVT )");
1855             continue;
1856         }
1857
1858         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1859             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1860             setAppliedRule(p2, "GB7    ( LV | V )  x  ( V | T )");
1861             continue;
1862         }
1863
1864         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1865             fTSet->contains(c2))  {
1866             setAppliedRule(p2, "GB8   ( LVT | T)  x T");
1867             continue;
1868         }
1869
1870         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1871             if (!fExtendSet->contains(c1)) {
1872                 cBase = c1;
1873             }
1874             setAppliedRule(p2, "GB9   x (Extend | ZWJ)");
1875             continue;
1876         }
1877
1878         if (fSpacingSet->contains(c2)) {
1879             setAppliedRule(p2, "GB9a  x  SpacingMark");
1880             continue;
1881         }
1882
1883         if (fPrependSet->contains(c1)) {
1884             setAppliedRule(p2, "GB9b  Prepend x");
1885             continue;
1886         }
1887
1888         //   Note: Viramas are also included in the ExtCccZwj class.
1889         if (fLinkingConsonantSet->contains(c2)) {
1890             int pi = p1;
1891             bool sawVirama = false;
1892             while (pi > 0 && fExtCccZwjSet->contains(fText->char32At(pi))) {
1893                 if (fViramaSet->contains(fText->char32At(pi))) {
1894                     sawVirama = true;
1895                 }
1896                 pi = fText->moveIndex32(pi, -1);
1897             }
1898             if (sawVirama && fLinkingConsonantSet->contains(fText->char32At(pi))) {
1899               setAppliedRule(p2, "GB9.3  LinkingConsonant ExtCccZwj* Virama ExtCccZwj* × LinkingConsonant");
1900               continue;
1901             }
1902         }
1903
1904         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1905           setAppliedRule(p2, "GB11  Extended_Pictographic Extend * ZWJ x Extended_Pictographic");
1906           continue;
1907         }
1908
1909         //                   Note: The first if condition is a little tricky. We only need to force
1910         //                      a break if there are three or more contiguous RIs. If there are
1911         //                      only two, a break following will occur via other rules, and will include
1912         //                      any trailing extend characters, which is needed behavior.
1913         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1914                 && fRegionalIndicatorSet->contains(c2)) {
1915           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1916           break;
1917         }
1918         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1919           setAppliedRule(p2, "GB12-13  Regional_Indicator x Regional_Indicator");
1920           continue;
1921         }
1922
1923         setAppliedRule(p2, "GB999 Any <break> Any");
1924         break;
1925     }
1926
1927     breakPos = p2;
1928     return breakPos;
1929 }
1930
1931
1932
1933 UVector  *RBBICharMonkey::charClasses() {
1934     return fSets;
1935 }
1936
1937 RBBICharMonkey::~RBBICharMonkey() {
1938     delete fSets;
1939     delete fCRLFSet;
1940     delete fControlSet;
1941     delete fExtendSet;
1942     delete fRegionalIndicatorSet;
1943     delete fPrependSet;
1944     delete fSpacingSet;
1945     delete fLSet;
1946     delete fVSet;
1947     delete fTSet;
1948     delete fLVSet;
1949     delete fLVTSet;
1950     delete fHangulSet;
1951     delete fAnySet;
1952     delete fZWJSet;
1953     delete fExtendedPictSet;
1954     delete fViramaSet;
1955     delete fLinkingConsonantSet;
1956     delete fExtCccZwjSet;
1957 }
1958
1959 //------------------------------------------------------------------------------------------
1960 //
1961 //   class RBBIWordMonkey      Word Break specific implementation
1962 //                             of RBBIMonkeyKind.
1963 //
1964 //------------------------------------------------------------------------------------------
1965 class RBBIWordMonkey: public RBBIMonkeyKind {
1966 public:
1967     RBBIWordMonkey();
1968     virtual          ~RBBIWordMonkey();
1969     virtual  UVector *charClasses();
1970     virtual  void     setText(const UnicodeString &s);
1971     virtual int32_t   next(int32_t i);
1972 private:
1973     UVector      *fSets;
1974
1975     UnicodeSet  *fCRSet;
1976     UnicodeSet  *fLFSet;
1977     UnicodeSet  *fNewlineSet;
1978     UnicodeSet  *fRegionalIndicatorSet;
1979     UnicodeSet  *fKatakanaSet;
1980     UnicodeSet  *fHebrew_LetterSet;
1981     UnicodeSet  *fALetterSet;
1982     UnicodeSet  *fSingle_QuoteSet;
1983     UnicodeSet  *fDouble_QuoteSet;
1984     UnicodeSet  *fMidNumLetSet;
1985     UnicodeSet  *fMidLetterSet;
1986     UnicodeSet  *fMidNumSet;
1987     UnicodeSet  *fNumericSet;
1988     UnicodeSet  *fFormatSet;
1989     UnicodeSet  *fOtherSet;
1990     UnicodeSet  *fExtendSet;
1991     UnicodeSet  *fExtendNumLetSet;
1992     UnicodeSet  *fWSegSpaceSet;
1993     UnicodeSet  *fDictionarySet;
1994     UnicodeSet  *fZWJSet;
1995     UnicodeSet  *fExtendedPictSet;
1996
1997     const UnicodeString  *fText;
1998 };
1999
2000
2001 RBBIWordMonkey::RBBIWordMonkey()
2002 {
2003     UErrorCode  status = U_ZERO_ERROR;
2004
2005     fSets            = new UVector(status);
2006
2007     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
2008     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
2009     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
2010     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
2011     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2012     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2013     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2014     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
2015     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
2016     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
2017     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]",    status);
2018     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
2019     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]", status);
2020     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
2021     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2022     // There are some sc=Hani characters with WB=Extend.
2023     // The break rules need to pick one or the other because
2024     // Extend overlapping with something else is messy.
2025     // For Unicode 13, we chose to keep U+16FF0 & U+16FF1
2026     // in $Han (for $dictionary) and out of $Extend.
2027     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}-[:Hani:]]", status);
2028     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
2029
2030     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
2031     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
2032
2033     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2034     fDictionarySet->addAll(*fKatakanaSet);
2035     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2036
2037     fALetterSet->removeAll(*fDictionarySet);
2038
2039     fOtherSet        = new UnicodeSet();
2040     if(U_FAILURE(status)) {
2041         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2042         deferredStatus = status;
2043         return;
2044     }
2045
2046     fOtherSet->complement();
2047     fOtherSet->removeAll(*fCRSet);
2048     fOtherSet->removeAll(*fLFSet);
2049     fOtherSet->removeAll(*fNewlineSet);
2050     fOtherSet->removeAll(*fKatakanaSet);
2051     fOtherSet->removeAll(*fHebrew_LetterSet);
2052     fOtherSet->removeAll(*fALetterSet);
2053     fOtherSet->removeAll(*fSingle_QuoteSet);
2054     fOtherSet->removeAll(*fDouble_QuoteSet);
2055     fOtherSet->removeAll(*fMidLetterSet);
2056     fOtherSet->removeAll(*fMidNumSet);
2057     fOtherSet->removeAll(*fNumericSet);
2058     fOtherSet->removeAll(*fExtendNumLetSet);
2059     fOtherSet->removeAll(*fWSegSpaceSet);
2060     fOtherSet->removeAll(*fFormatSet);
2061     fOtherSet->removeAll(*fExtendSet);
2062     fOtherSet->removeAll(*fRegionalIndicatorSet);
2063     fOtherSet->removeAll(*fZWJSet);
2064     fOtherSet->removeAll(*fExtendedPictSet);
2065
2066     // Inhibit dictionary characters from being tested at all.
2067     fOtherSet->removeAll(*fDictionarySet);
2068
2069     // Add classes and their names
2070     fSets->addElement(fCRSet, status); classNames.push_back("CR");
2071     fSets->addElement(fLFSet, status); classNames.push_back("LF");
2072     fSets->addElement(fNewlineSet, status); classNames.push_back("Newline");
2073     fSets->addElement(fRegionalIndicatorSet, status); classNames.push_back("RegionalIndicator");
2074     fSets->addElement(fHebrew_LetterSet, status); classNames.push_back("Hebrew");
2075     fSets->addElement(fALetterSet, status); classNames.push_back("ALetter");
2076     fSets->addElement(fSingle_QuoteSet, status); classNames.push_back("Single Quote");
2077     fSets->addElement(fDouble_QuoteSet, status); classNames.push_back("Double Quote");
2078     // Omit Katakana from fSets, which omits Katakana characters
2079     // from the test data. They are all in the dictionary set,
2080     // which this (old, to be retired) monkey test cannot handle.
2081     //fSets->addElement(fKatakanaSet, status);
2082
2083     fSets->addElement(fMidLetterSet, status); classNames.push_back("MidLetter");
2084     fSets->addElement(fMidNumLetSet, status); classNames.push_back("MidNumLet");
2085     fSets->addElement(fMidNumSet, status); classNames.push_back("MidNum");
2086     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2087     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2088     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2089     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2090     fSets->addElement(fExtendNumLetSet, status); classNames.push_back("ExtendNumLet");
2091     fSets->addElement(fWSegSpaceSet, status); classNames.push_back("WSegSpace");
2092
2093     fSets->addElement(fZWJSet, status); classNames.push_back("ZWJ");
2094     fSets->addElement(fExtendedPictSet, status); classNames.push_back("ExtendedPict");
2095
2096     if (U_FAILURE(status)) {
2097         deferredStatus = status;
2098     }
2099 }
2100
2101 void RBBIWordMonkey::setText(const UnicodeString &s) {
2102     fText       = &s;
2103     prepareAppliedRules(s.length());
2104 }
2105
2106
2107 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2108     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2109                               //   break position being tested.  The candidate break
2110                               //   location is before p2.
2111
2112     int     breakPos = -1;
2113
2114     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2115
2116     if (U_FAILURE(deferredStatus)) {
2117         return -1;
2118     }
2119
2120     // Prev break at end of string.  return DONE.
2121     if (prevPos >= fText->length()) {
2122         return -1;
2123     }
2124     p0 = p1 = p2 = p3 = prevPos;
2125     c3 =  fText->char32At(prevPos);
2126     c0 = c1 = c2 = 0;
2127     (void)p0;       // Suppress set but not used warning.
2128
2129     // Loop runs once per "significant" character position in the input text.
2130     for (;;) {
2131         // Move all of the positions forward in the input string.
2132         p0 = p1;  c0 = c1;
2133         p1 = p2;  c1 = c2;
2134         p2 = p3;  c2 = c3;
2135
2136         // Advance p3 by    X(Extend | Format)*   Rule 4
2137         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2138         do {
2139             p3 = fText->moveIndex32(p3, 1);
2140             c3 = fText->char32At(p3);
2141             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2142                break;
2143             }
2144         }
2145         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2146
2147
2148         if (p1 == p2) {
2149             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2150             continue;
2151         }
2152
2153         if (p2 == fText->length()) {
2154             // Reached end of string.  Always a break position.
2155             break;
2156         }
2157
2158         //     No Extend or Format characters may appear between the CR and LF,
2159         //     which requires the additional check for p2 immediately following p1.
2160         //
2161         if (c1==0x0D && c2==0x0A) {
2162           setAppliedRule(p2, "WB3   CR x LF");
2163           continue;
2164         }
2165
2166         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2167             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2168             break;
2169         }
2170         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2171             setAppliedRule(p2, "WB3a  Break before and after newlines (including CR and LF)");
2172             break;
2173         }
2174
2175         //              Not ignoring extend chars, so peek into input text to
2176         //              get the potential ZWJ, the character immediately preceding c2.
2177         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2178         //              but char32At will get the full code point.
2179         if (fZWJSet->contains(fText->char32At(p2 - 1)) && fExtendedPictSet->contains(c2)){
2180             setAppliedRule(p2, "WB3c  ZWJ x Extended_Pictographic");
2181             continue;
2182         }
2183
2184         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2185             setAppliedRule(p2, "WB3d  Keep horizontal whitespace together.");
2186             continue;
2187         }
2188
2189         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2190             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2191             setAppliedRule(p2, "WB4   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)");
2192             continue;
2193         }
2194
2195         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2196              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2197              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2198             setAppliedRule(p2,
2199                            "WB6   (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter _Letter)");
2200             continue;
2201         }
2202
2203         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2204             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2205             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2206             setAppliedRule(p2,
2207                            "WB7   (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)");
2208             continue;
2209         }
2210
2211         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2212             setAppliedRule(p2, "WB7a  Hebrew_Letter x Single_Quote");
2213             continue;
2214         }
2215
2216           if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2217             setAppliedRule(p2, "WB7b  Hebrew_Letter x Double_Quote Hebrew_Letter");
2218             continue;
2219         }
2220
2221         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2222             setAppliedRule(p2, "WB7c  Hebrew_Letter Double_Quote x Hebrew_Letter");
2223             continue;
2224         }
2225
2226         if (fNumericSet->contains(c1) &&
2227             fNumericSet->contains(c2)) {
2228             setAppliedRule(p2, "WB8   Numeric x Numeric");
2229             continue;
2230         }
2231
2232         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2233             fNumericSet->contains(c2)) {
2234             setAppliedRule(p2, "WB9   (ALetter | Hebrew_Letter) x Numeric");
2235             continue;
2236         }
2237
2238         if (fNumericSet->contains(c1) &&
2239             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2240             setAppliedRule(p2, "WB10   Numeric x (ALetter | Hebrew_Letter)");
2241             continue;
2242         }
2243
2244           if (fNumericSet->contains(c0) &&
2245             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2246             fNumericSet->contains(c2)) {
2247             setAppliedRule(p2, "WB11  Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric");
2248             continue;
2249         }
2250
2251         if (fNumericSet->contains(c1) &&
2252             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2253             fNumericSet->contains(c3)) {
2254             setAppliedRule(p2, "WB12  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric");
2255             continue;
2256         }
2257
2258         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2259         //                  all Katakana are handled by the dictionary breaker.
2260         if (fKatakanaSet->contains(c1) &&
2261             fKatakanaSet->contains(c2))  {
2262             setAppliedRule(p2, "WB13  Katakana x Katakana");
2263             continue;
2264         }
2265
2266         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2267              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2268              fExtendNumLetSet->contains(c2)) {
2269             setAppliedRule(p2,
2270                            "WB13a (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet");
2271             continue;
2272         }
2273
2274         if (fExtendNumLetSet->contains(c1) &&
2275                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2276                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2277             setAppliedRule(p2, "WB13b ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)");
2278             continue;
2279         }
2280
2281         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2282             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2283             break;
2284         }
2285         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2286             setAppliedRule(p2, "WB15 - WB17   Group pairs of Regional Indicators.");
2287             continue;
2288         }
2289
2290         setAppliedRule(p2, "WB999");
2291         break;
2292     }
2293
2294     breakPos = p2;
2295     return breakPos;
2296 }
2297
2298 // charClasses()  Returns fSets.  The vector stays owned by this object (the destructor deletes it).
2299 UVector* RBBIWordMonkey::charClasses() {
2300     return this->fSets;
2301 }
2302
     // Destructor.  Deletes the fSets container first and then each of the
     // set members listed below, one by one.
2303 RBBIWordMonkey::~RBBIWordMonkey() {
2304     delete fSets;
2305     delete fCRSet;
2306     delete fLFSet;
2307     delete fNewlineSet;
2308     delete fKatakanaSet;
2309     delete fHebrew_LetterSet;
2310     delete fALetterSet;
2311     delete fSingle_QuoteSet;
2312     delete fDouble_QuoteSet;
2313     delete fMidNumLetSet;
2314     delete fMidLetterSet;
2315     delete fMidNumSet;
2316     delete fNumericSet;
2317     delete fFormatSet;
2318     delete fExtendSet;
2319     delete fExtendNumLetSet;
2320     delete fWSegSpaceSet;
2321     delete fRegionalIndicatorSet;
2322     delete fDictionarySet;
2323     delete fOtherSet;
2324     delete fZWJSet;
2325     delete fExtendedPictSet;
2326 }
2327
2328
2329
2330
2331 //------------------------------------------------------------------------------------------
2332 //
2333 //   class RBBISentMonkey      Sentence Break specific implementation
2334 //                             of RBBIMonkeyKind.
2335 //
2336 //------------------------------------------------------------------------------------------
2337 class RBBISentMonkey: public RBBIMonkeyKind {
2338 public:
2339     RBBISentMonkey();
2340     virtual          ~RBBISentMonkey();
         // Returns fSets, the vector in which the constructor registers one UnicodeSet
         // per character class.
2341     virtual  UVector *charClasses();
         // Stores the address of s as the text to analyze; s itself is not copied.
2342     virtual  void     setText(const UnicodeString &s);
         // Returns the index of the next sentence boundary following the previous
         // boundary i, or -1 when there is none to report.
2343     virtual int32_t   next(int32_t i);
2344 private:
         // moveBack / moveForward step to the neighbouring code point that is in
         // neither fFormatSet nor fExtendSet; cAt is a bounds-checked char32At that
         // yields -1 outside the text.
2345     int               moveBack(int posFrom);
2346     int               moveForward(int posFrom);
2347     UChar32           cAt(int pos);
2348
         // All character-class sets, in the order the constructor adds them.
2349     UVector      *fSets;
2350
         // One set per Sentence_Break property value.  fSepSet additionally contains
         // U+000A and U+000D, and fOtherSet is computed as everything not in any of
         // the other sets (see the constructor).
2351     UnicodeSet  *fSepSet;
2352     UnicodeSet  *fFormatSet;
2353     UnicodeSet  *fSpSet;
2354     UnicodeSet  *fLowerSet;
2355     UnicodeSet  *fUpperSet;
2356     UnicodeSet  *fOLetterSet;
2357     UnicodeSet  *fNumericSet;
2358     UnicodeSet  *fATermSet;
2359     UnicodeSet  *fSContinueSet;
2360     UnicodeSet  *fSTermSet;
2361     UnicodeSet  *fCloseSet;
2362     UnicodeSet  *fOtherSet;
2363     UnicodeSet  *fExtendSet;
2364
         // Text currently being analyzed; assigned by setText() and never deleted by
         // this class.
2365     const UnicodeString  *fText;
2366 };
2367
2368 // Constructor.  Creates one UnicodeSet per Sentence_Break property value, derives fOtherSet
2369 //   as the complement of all of them, and registers each set in fSets together with a
2370 //   printable name in classNames.  A failing UErrorCode is recorded in deferredStatus.
2371 RBBISentMonkey::RBBISentMonkey() :
2372     RBBIMonkeyKind(), fSets(NULL), fText(NULL)   // fText stays NULL until setText() supplies the text.
2373 {
2374     UErrorCode  status = U_ZERO_ERROR;
2375     fSets            = new UVector(status);
2376     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2377     //                       set and made into character classes of their own.  For the monkey impl,
2378     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2379     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2380     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2381     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2382     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2383     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2384     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2385     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2386     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2387     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2388     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2389     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2390     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2391     fOtherSet        = new UnicodeSet();
2392     if (U_FAILURE(status)) {
2393         deferredStatus = status;
2394         return;
2395     }
2396     // "Other" is every code point that none of the sets built above contains.
2397     fOtherSet->complement();
2398     fOtherSet->removeAll(*fSepSet);
2399     fOtherSet->removeAll(*fFormatSet);
2400     fOtherSet->removeAll(*fSpSet);
2401     fOtherSet->removeAll(*fLowerSet);
2402     fOtherSet->removeAll(*fUpperSet);
2403     fOtherSet->removeAll(*fOLetterSet);
2404     fOtherSet->removeAll(*fNumericSet);
2405     fOtherSet->removeAll(*fATermSet);
2406     fOtherSet->removeAll(*fSContinueSet);
2407     fOtherSet->removeAll(*fSTermSet);
2408     fOtherSet->removeAll(*fCloseSet);
2409     fOtherSet->removeAll(*fExtendSet);
2410     // Register the sets; classNames receives the matching name at the same position.
2411     fSets->addElement(fSepSet, status); classNames.push_back("Sep");
2412     fSets->addElement(fFormatSet, status); classNames.push_back("Format");
2413     fSets->addElement(fSpSet, status); classNames.push_back("Sp");
2414     fSets->addElement(fLowerSet, status); classNames.push_back("Lower");
2415     fSets->addElement(fUpperSet, status); classNames.push_back("Upper");
2416     fSets->addElement(fOLetterSet, status); classNames.push_back("OLetter");
2417     fSets->addElement(fNumericSet, status); classNames.push_back("Numeric");
2418     fSets->addElement(fATermSet, status); classNames.push_back("ATerm");
2419     fSets->addElement(fSContinueSet, status); classNames.push_back("SContinue");
2420     fSets->addElement(fSTermSet, status); classNames.push_back("STerm");
2421     fSets->addElement(fCloseSet, status); classNames.push_back("Close");
2422     fSets->addElement(fOtherSet, status); classNames.push_back("Other");
2423     fSets->addElement(fExtendSet, status); classNames.push_back("Extend");
2424     if (U_FAILURE(status)) {
2425         deferredStatus = status;
2426     }
2427 }
2428
2429 // setText()  Keeps a pointer to s (no copy is made) and calls prepareAppliedRules() with its length.
2430 void RBBISentMonkey::setText(const UnicodeString &s) {
2431     const int32_t textLength = s.length();
2432     fText = &s;
2433     prepareAppliedRules(textLength);
2434 }
2435 // charClasses()  Returns fSets, the vector of sets registered by the constructor; still owned by this object.
2436 UVector* RBBISentMonkey::charClasses() {
2437     return this->fSets;
2438 }
2439
2440 //  moveBack()   Step backwards from index i to the closest preceding code point that is in
2441 //               neither fFormatSet nor fExtendSet.  Returns -1 when i <= 0; otherwise the
2442 //               index reached, which is 0 whenever the scan runs into the start of the text.
2443 //
2444 int RBBISentMonkey::moveBack(int i) {
2445     if (i <= 0) {
2446         return -1;
2447     }
2448     int32_t idx = i;
2449     for (;;) {
2450         idx = fText->moveIndex32(idx, -1);
2451         if (idx <= 0) { break; }           // Start of text: stop whatever is found there.
2452         const UChar32 ch = fText->char32At(idx);
2453         if (!fFormatSet->contains(ch) && !fExtendSet->contains(ch)) { break; }
2454     }
2455     return idx;
2456 }
2457
2458 //  moveForward()   Find the next "significant" code point following the index i.
2459 //                  Steps one code point, then keeps stepping while the code point reached is
2460 //                  in fFormatSet or fExtendSet.  Returns the text length if i is at/after the end.
2461 int RBBISentMonkey::moveForward(int i) {
2462     const int32_t textLength = fText->length();
2463     if (i >= textLength) {
2464         return textLength;
2465     }
2466     int32_t idx = fText->moveIndex32(i, 1);
2467     for (UChar32 ch = cAt(idx); fFormatSet->contains(ch) || fExtendSet->contains(ch); ch = cAt(idx)) {
2468         idx = fText->moveIndex32(idx, 1);
2469     }
2470     return idx;
2471 }
2472 // cAt()   Bounds-checked code point fetch:  the code point at index pos of the text,
2473 //         or -1 when pos is negative or at/after the end of the text.
2474 UChar32 RBBISentMonkey::cAt(int pos) {
2475     if (pos >= 0 && pos < fText->length()) {
2476         return fText->char32At(pos);
2477     }
2478     return -1;
2479 }
2480
     // next()   Finds the sentence boundary that follows the previous boundary prevPos.
     //          Walks forward one "significant" code point at a time (moveForward() skips
     //          Format and Extend characters) and tests the rules coded below in order.  The
     //          first rule that matches either suppresses a break at p2 (continue) or ends
     //          the search with a break at p2 (break).  The rule chosen for each examined
     //          position is passed to setAppliedRule().
     //
     //          Returns the index of the boundary, or -1 if deferredStatus holds a failure
     //          or prevPos is already at or beyond the end of the text.
2481 int32_t RBBISentMonkey::next(int32_t prevPos) {
2482     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2483                               //   break position being tested.  The candidate break
2484                               //   location is before p2.
2485
2486     int     breakPos = -1;
2487
2488     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2489     UChar32 c;
2490
2491     if (U_FAILURE(deferredStatus)) {
2492         return -1;
2493     }
2494
2495     // Prev break at end of string.  return DONE.
2496     if (prevPos >= fText->length()) {
2497         return -1;
2498     }
         // Prime the sliding window:  all four positions start at prevPos and only c3
         // holds a real code point.  The shifts at the top of the loop move it down
         // into c2, then c1, then c0.
2499     p0 = p1 = p2 = p3 = prevPos;
2500     c3 =  fText->char32At(prevPos);
2501     c0 = c1 = c2 = 0;
2502     (void)p0;     // Suppress set but not used warning.
2503
2504     // Loop runs once per "significant" character position in the input text.
2505     for (;;) {
2506         // Move all of the positions forward in the input string.
2507         p0 = p1;  c0 = c1;
2508         p1 = p2;  c1 = c2;
2509         p2 = p3;  c2 = c3;
2510
2511         // Advance p3 by    X(Extend | Format)*   Rule 4
2512         p3 = moveForward(p3);
2513         c3 = cAt(p3);
2514
             // The p2==(p1+1) test requires the LF to follow the CR directly, with no
             // skipped Extend/Format characters in between.
2515         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2516             setAppliedRule(p2, "SB3   CR x LF");
2517             continue;
2518         }
2519
2520         if (fSepSet->contains(c1)) {
2521             p2 = p1+1;   // Separators don't combine with Extend or Format.
2522
2523             setAppliedRule(p2, "SB4   Sep  <break>");
2524             break;
2525         }
2526
2527         if (p2 >= fText->length()) {
2528             // Reached end of string.  Always a break position.
2529             setAppliedRule(p2, "SB4   Sep  <break>");
2530             break;
2531         }
2532
2533         if (p2 == prevPos) {
2534             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2535             setAppliedRule(p2, "SB4   Sep  <break>");
2536             continue;
2537         }
2538
2539         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2540             setAppliedRule(p2, "SB6   ATerm x Numeric");
2541             continue;
2542         }
2543
2544           if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2545                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2546             setAppliedRule(p2, "SB7   (Upper | Lower) ATerm  x  Uppper");
2547             continue;
2548         }
2549
2550         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2551         //                  note to the Unicode 5.0 documents.
             // SB8, left context:  walk back from p1 over Sp* and then Close*; the rule
             // is only considered if that lands on an ATerm.
2552         int p8 = p1;
2553         while (fSpSet->contains(cAt(p8))) {
2554             p8 = moveBack(p8);
2555         }
2556         while (fCloseSet->contains(cAt(p8))) {
2557             p8 = moveBack(p8);
2558         }
2559         if (fATermSet->contains(cAt(p8))) {
                 // SB8, right context:  scan forward from p2 to the first code point
                 // that is end-of-text (-1) or belongs to one of the classes listed in
                 // the condition.  setAppliedRule() is called when the scan stops; the
                 // break is suppressed only if the stopping code point is Lower (tested
                 // after the loop), otherwise control falls through to the later rules.
2560             p8=p2;
2561             for (;;) {
2562                 c = cAt(p8);
2563                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2564                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2565                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2566
2567                     setAppliedRule(p2,
2568                                    "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2569                     break;
2570                 }
2571                 p8 = moveForward(p8);
2572             }
2573             if (fLowerSet->contains(cAt(p8))) {
2574
2575                 setAppliedRule(p2,
2576                                "SB8   ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* ");
2577                 continue;
2578             }
2579         }
2580
             // SB8a:  only examined when c2 is SContinue, STerm or ATerm; the left
             // context is found by the same backward walk over Sp* then Close*.
2581         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2582             p8 = p1;
2583             while (fSpSet->contains(cAt(p8))) {
2584                 p8 = moveBack(p8);
2585             }
2586             while (fCloseSet->contains(cAt(p8))) {
2587                 p8 = moveBack(p8);
2588             }
2589             c = cAt(p8);
2590             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2591                 setAppliedRule(p2, "SB8a  (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm)");
2592                 continue;
2593             }
2594         }
2595
             // SB9:  the backward walk skips Close* only (no Sp*).
2596         int p9 = p1;
2597         while (fCloseSet->contains(cAt(p9))) {
2598             p9 = moveBack(p9);
2599         }
2600         c = cAt(p9);
2601         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2602             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2603
2604                 setAppliedRule(p2, "SB9  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)");
2605                 continue;
2606             }
2607         }
2608
             // SB10:  backward walk over Sp* then Close*.
2609         int p10 = p1;
2610         while (fSpSet->contains(cAt(p10))) {
2611             p10 = moveBack(p10);
2612         }
2613         while (fCloseSet->contains(cAt(p10))) {
2614             p10 = moveBack(p10);
2615         }
2616         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2617             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2618                 setAppliedRule(p2, "SB10  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)");
2619                 continue;
2620             }
2621         }
2622
             // SB11:  backward walk over at most one Sep, then Sp*, then Close*.  Landing
             // on an STerm or ATerm yields a break at p2.
2623         int p11 = p1;
2624         if (fSepSet->contains(cAt(p11))) {
2625             p11 = moveBack(p11);
2626         }
2627         while (fSpSet->contains(cAt(p11))) {
2628             p11 = moveBack(p11);
2629         }
2630         while (fCloseSet->contains(cAt(p11))) {
2631             p11 = moveBack(p11);
2632         }
2633         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2634           setAppliedRule(p2, "SB11  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>");
2635             break;
2636         }
2637
             // No earlier rule matched:  no break here, move on to the next position.
2638         setAppliedRule(p2, "SB12  Any x Any");
2639         continue;
2640     }
2641
         // The loop is only left through a 'break', with p2 at the boundary found.
2642     breakPos = p2;
2643     return breakPos;
2644 }
2645 // Destructor.  Releases the fSets vector and all thirteen character-class sets.
2646 //   fText is not deleted:  setText() only stores the address of the caller's string.
2647 RBBISentMonkey::~RBBISentMonkey() {
2648     delete fSets;
2649
2650     // Same order as the member declarations.
2651     UnicodeSet *const ownedSets[] = {
2652         fSepSet,     fFormatSet, fSpSet,
2653         fLowerSet,   fUpperSet,  fOLetterSet,
2654         fNumericSet, fATermSet,  fSContinueSet,
2655         fSTermSet,   fCloseSet,  fOtherSet,
2656         fExtendSet
2657     };
2658     for (UnicodeSet *set : ownedSets) {
2659         delete set;
2660     }
2661 }
2662
2663
2664
2665 //-------------------------------------------------------------------------------------------
2666 //
2667 //  RBBILineMonkey
2668 //
2669 //-------------------------------------------------------------------------------------------
2670
     // Line-break implementation of RBBIMonkeyKind.  Holds one UnicodeSet per
     // Line_Break class used by its rules, plus the helpers next() relies on.
2671 class RBBILineMonkey: public RBBIMonkeyKind {
2672 public:
2673     RBBILineMonkey();
2674     virtual          ~RBBILineMonkey();
         // Returns fSets, the vector of character-class sets registered by the constructor.
2675     virtual  UVector *charClasses();
         // Stores a pointer to s and hands s to fCharBI and fNumberMatcher.
2676     virtual  void     setText(const UnicodeString &s);
         // Returns the next line-break position, or -1 (failed deferredStatus, or i
         // at/after the end of the text).
2677     virtual  int32_t  next(int32_t i);
         // Applies rules LB 9 / LB 10 (combining-mark handling) to the character at pos;
         // see the comment on the definition.
2678     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2679 private:
         // Sets registered by the constructor, paired by position with classNames.
2680     UVector      *fSets;
2681
         // Most members below are named after the Line_Break property value they are
         // built from.  The exceptions and adjustments (all made in the constructor):
         //   fSG    the range U+D800..U+DFFF
         //   fHH    starts empty, then U+2010 is added
         //   fCL    Line_Break=CL plus U+201D
         //   fOP    Line_Break=OP plus U+201C and U+2018
         //   fQU    Line_Break=QU minus U+201C, U+2018 and U+201D
         //   fEB    Line_Break=EB plus a few explicitly listed code points
         //   fAL    also receives fXX, fAI and fSG
         //   fNS    also receives fCJ
         //   fCM    also receives fZWJ
         //   fOP30 / fCP30   the OP / CP sets minus East_Asian_Width F, W and H
2682     UnicodeSet  *fBK;
2683     UnicodeSet  *fCR;
2684     UnicodeSet  *fLF;
2685     UnicodeSet  *fCM;
2686     UnicodeSet  *fNL;
2687     UnicodeSet  *fSG;
2688     UnicodeSet  *fWJ;
2689     UnicodeSet  *fZW;
2690     UnicodeSet  *fGL;
2691     UnicodeSet  *fCB;
2692     UnicodeSet  *fSP;
2693     UnicodeSet  *fB2;
2694     UnicodeSet  *fBA;
2695     UnicodeSet  *fBB;
2696     UnicodeSet  *fHH;
2697     UnicodeSet  *fHY;
2698     UnicodeSet  *fH2;
2699     UnicodeSet  *fH3;
2700     UnicodeSet  *fCL;
2701     UnicodeSet  *fCP;
2702     UnicodeSet  *fEX;
2703     UnicodeSet  *fIN;
2704     UnicodeSet  *fJL;
2705     UnicodeSet  *fJV;
2706     UnicodeSet  *fJT;
2707     UnicodeSet  *fNS;
2708     UnicodeSet  *fOP;
2709     UnicodeSet  *fQU;
2710     UnicodeSet  *fIS;
2711     UnicodeSet  *fNU;
2712     UnicodeSet  *fPO;
2713     UnicodeSet  *fPR;
2714     UnicodeSet  *fSY;
2715     UnicodeSet  *fAI;
2716     UnicodeSet  *fAL;
2717     UnicodeSet  *fCJ;
2718     UnicodeSet  *fHL;
2719     UnicodeSet  *fID;
2720     UnicodeSet  *fRI;
2721     UnicodeSet  *fXX;
2722     UnicodeSet  *fEB;
2723     UnicodeSet  *fEM;
2724     UnicodeSet  *fZWJ;
2725     UnicodeSet  *fOP30;
2726     UnicodeSet  *fCP30;
2727
         // fCharBI         character break iterator created for the English locale
         // fText           text under test; set by setText()
         // fNumberMatcher  regular-expression matcher built from the number pattern
         //                 assembled in the constructor
2728     BreakIterator        *fCharBI;
2729     const UnicodeString  *fText;
2730     RegexMatcher         *fNumberMatcher;
2731 };
2732
     // Constructor.  Builds a UnicodeSet for every Line_Break class the monkey uses, folds
     // the classes that share default behavior (XX, AI and SG into AL; CJ into NS; ZWJ into
     // CM), registers the sets with their names in fSets / classNames, and finally creates
     // the number-sequence regex matcher and a character break iterator.  Any failing
     // UErrorCode is stored in deferredStatus.
2733 RBBILineMonkey::RBBILineMonkey() :
2734     RBBIMonkeyKind(),
2735     fSets(NULL),
2736
2737     fCharBI(NULL),
2738     fText(NULL),
2739     fNumberMatcher(NULL)
2740
2741 {
         // NOTE(review): on this early return only fSets, fCharBI, fText and fNumberMatcher
         // have been initialized; none of the UnicodeSet members has been assigned yet.
2742     if (U_FAILURE(deferredStatus)) {
2743         return;
2744     }
2745
2746     UErrorCode  status = U_ZERO_ERROR;
2747
2748     fSets  = new UVector(status);
2749
         // One set per Line_Break property value.  Exceptions:  fHH starts out empty,
         // fSG is an explicit code point range, and the lines marked "en adjustments"
         // move U+201C / U+2018 / U+201D between the QU, OP and CL sets.
2750     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2751     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2752     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2753     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2754     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2755     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2756     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2757     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2758     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2759     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2760     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2761     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2762     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2763     fHH    = new UnicodeSet();
2764     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2765     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2766     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2767     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=CL}] [\\u201D]]"), status); // en adjustments for rdar://problem/51193810
2768     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2769     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2770     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2771     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2772     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2773     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2774     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2775     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=OP}] [\\u201C\\u2018]]"), status); // en adjustments
2776     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=QU}]-[\\u201C\\u2018\\u201D]]"), status); // en adjustments
2777     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2778     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2779     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2780     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2781     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2782     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2783     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2784     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2785     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2786     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2787     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2788     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2789     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2790     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2791     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2792     fZWJ   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2793     fOP30  = new UnicodeSet(u"[[\\p{Line_break=OP} [\\u201C\\u2018]]-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status); // en adjustments
2794     fCP30  = new UnicodeSet(u"[\\p{Line_break=CP}-[\\p{ea=F}\\p{ea=W}\\p{ea=H}]]", status);
2795
         // Stop before touching any set if one of the patterns above failed.
2796     if (U_FAILURE(status)) {
2797         deferredStatus = status;
2798         return;
2799     }
2800
2801     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2802     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2803     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2804
2805     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2806     fCM->addAll(*fZWJ);    // ZWJ behaves as a CM.
2807
2808     fHH->add(u'\u2010');   // Hyphen, '‐'
2809
2810     // Sets and names.
         // fHH, fXX and fCJ are not added to fSets below.
2811     fSets->addElement(fBK, status); classNames.push_back("fBK");
2812     fSets->addElement(fCR, status); classNames.push_back("fCR");
2813     fSets->addElement(fLF, status); classNames.push_back("fLF");
2814     fSets->addElement(fCM, status); classNames.push_back("fCM");
2815     fSets->addElement(fNL, status); classNames.push_back("fNL");
2816     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2817     fSets->addElement(fZW, status); classNames.push_back("fZW");
2818     fSets->addElement(fGL, status); classNames.push_back("fGL");
2819     fSets->addElement(fCB, status); classNames.push_back("fCB");
2820     fSets->addElement(fSP, status); classNames.push_back("fSP");
2821     fSets->addElement(fB2, status); classNames.push_back("fB2");
2822     fSets->addElement(fBA, status); classNames.push_back("fBA");
2823     fSets->addElement(fBB, status); classNames.push_back("fBB");
2824     fSets->addElement(fHY, status); classNames.push_back("fHY");
2825     fSets->addElement(fH2, status); classNames.push_back("fH2");
2826     fSets->addElement(fH3, status); classNames.push_back("fH3");
2827     fSets->addElement(fCL, status); classNames.push_back("fCL");
2828     fSets->addElement(fCP, status); classNames.push_back("fCP");
2829     fSets->addElement(fEX, status); classNames.push_back("fEX");
2830     fSets->addElement(fIN, status); classNames.push_back("fIN");
2831     fSets->addElement(fJL, status); classNames.push_back("fJL");
2832     fSets->addElement(fJT, status); classNames.push_back("fJT");
2833     fSets->addElement(fJV, status); classNames.push_back("fJV");
2834     fSets->addElement(fNS, status); classNames.push_back("fNS");
2835     fSets->addElement(fOP, status); classNames.push_back("fOP");
2836     fSets->addElement(fQU, status); classNames.push_back("fQU");
2837     fSets->addElement(fIS, status); classNames.push_back("fIS");
2838     fSets->addElement(fNU, status); classNames.push_back("fNU");
2839     fSets->addElement(fPO, status); classNames.push_back("fPO");
2840     fSets->addElement(fPR, status); classNames.push_back("fPR");
2841     fSets->addElement(fSY, status); classNames.push_back("fSY");
2842     fSets->addElement(fAI, status); classNames.push_back("fAI");
2843     fSets->addElement(fAL, status); classNames.push_back("fAL");
2844     fSets->addElement(fHL, status); classNames.push_back("fHL");
2845     fSets->addElement(fID, status); classNames.push_back("fID");
         // NOTE(review): fWJ was already registered above, so it appears in fSets twice.
         // This looks unintentional; confirm before removing, since it changes the
         // contents of fSets.
2846     fSets->addElement(fWJ, status); classNames.push_back("fWJ");
2847     fSets->addElement(fRI, status); classNames.push_back("fRI");
2848     fSets->addElement(fSG, status); classNames.push_back("fSG");
2849     fSets->addElement(fEB, status); classNames.push_back("fEB");
2850     fSets->addElement(fEM, status); classNames.push_back("fEM");
2851     fSets->addElement(fZWJ, status); classNames.push_back("fZWJ");
2852     // TODO: fOP30 & fCP30 overlap with plain fOP. Probably OK, but fOP/CP chars will be over-represented.
2853     fSets->addElement(fOP30, status); classNames.push_back("fOP30");
2854     fSets->addElement(fCP30, status); classNames.push_back("fCP30");
2855
         // Regular expression for a numeric sequence, written in terms of Line_Break
         // classes:  optional PR|PO, optional OP|HY, optional IS, one NU, any number of
         // NU|IS|SY, optional CL|CP, optional PR|PO.  Every element may be followed by
         // any number of CM or U+200D characters.
2856     const char *rules =
2857             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2858             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2859             "((\\p{Line_Break=IS})(\\p{Line_Break=CM}|\\u200d)*)?"
2860             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2861             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2862             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2863             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2864
2865     fNumberMatcher = new RegexMatcher(
2866         UnicodeString(rules, -1, US_INV), 0, status);
2867
2868     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2869
         // Covers the addElement calls, the regex compilation and the break iterator creation.
2870     if (U_FAILURE(status)) {
2871         deferredStatus = status;
2872     }
2873
2874 }
2875 // setText()  Points this monkey, its character break iterator and its number matcher at s.
2876 void RBBILineMonkey::setText(const UnicodeString &s) {
2877     const int32_t textLength = s.length();
2878     fText = &s;                          // Pointer only; s is not copied here.
2879     fCharBI->setText(s);
2880     prepareAppliedRules(textLength);
2881     fNumberMatcher->reset(s);
2882 }
2883
2884 //
2885 //  rule9Adjust
2886 //     Implements Line Break rules LB 9 and LB 10 for one character position.
2887 //
2888 //     pos       Index of the character being adjusted.  -1 means "no character yet"
2889 //               (warm-up iteration of next()); nothing is touched in that case.
2890 //     posChar   In/out: the character at pos.  Replaced by 'A' when it is itself in fCM.
2891 //     nextPos   In/out: index of the following character; advanced past the run of
2892 //               fCM characters that attaches to *posChar, if there is one.
2893 //     nextChar  Out: the character found at the (possibly advanced) *nextPos.
2894 //
2895 //     Factored out of next() because it has to run twice per candidate break:  once for
2896 //     the character before the position being tested and once for the character after it.
2897 //
2898 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2899     if (pos == -1) {
2900         // Invalid initial position; leave every argument as it is.
2901         return;
2902     }
2903
2904     // LB 9  Keep combining sequences together.  Line Break CM is not the same thing as
2905     //       Grapheme Extend.  SP, BK, CR, LF, NL and ZW do not absorb following marks.
2906     const UChar32 base = *posChar;
2907     const bool    baseRejectsMarks =
2908             fSP->contains(base) || fBK->contains(base) || base == 0x0d || base == 0x0a ||
2909             fNL->contains(base) || fZW->contains(base);
2910
2911     int32_t scanPos = *nextPos;
2912     if (!baseRejectsMarks) {
2913         *nextChar = fText->char32At(scanPos);
2914         while (fCM->contains(*nextChar)) {
2915             scanPos   = fText->moveIndex32(scanPos, 1);
2916             *nextChar = fText->char32At(scanPos);
2917         }
2918     }
2919
2920     // LB 9  Treat X CM* as if it were X:  nothing more to do, the marks were skipped above.
2921
2922     // LB 10  Treat any remaining combining mark as AL.
2923     if (fCM->contains(*posChar)) {
2924         *posChar = u'A';
2925     }
2926
2927     // Hand the results back to the caller.  They differ from the incoming values only
2928     // when a combining sequence was consumed above.
2929     *nextPos  = scanPos;
2930     *nextChar = fText->char32At(scanPos);
2931 }
2932
2933
2934
2935 int32_t RBBILineMonkey::next(int32_t startPos) {
2936     UErrorCode status = U_ZERO_ERROR;
2937     int32_t    pos;       //  Index of the char following a potential break position
2938     UChar32    thisChar;  //  Character at above position "pos"
2939
2940     int32_t    prevPos;   //  Index of the char preceding a potential break position
2941     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2942                           //   and thisChar may not be adjacent because combining
2943                           //   characters between them will be ignored.
2944
2945     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2946     UChar32    prevCharX2;
2947
2948     int32_t    nextPos;   //  Index of the next character following pos.
2949                           //     Usually skips over combining marks.
2950     int32_t    nextCPPos; //  Index of the code point following "pos."
2951                           //     May point to a combining mark.
2952     int32_t    tPos;      //  temp value.
2953     UChar32    c;
2954
2955     if (U_FAILURE(deferredStatus)) {
2956         return -1;
2957     }
2958
2959     if (startPos >= fText->length()) {
2960         return -1;
2961     }
2962
2963
2964     // Initial values for loop.  Loop will run the first time without finding breaks,
2965     //                           while the invalid values shift out and the "this" and
2966     //                           "prev" positions are filled in with good values.
2967     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2968     thisChar = prevChar  = prevCharX2 = 0;
2969     nextPos  = nextCPPos = startPos;
2970
2971
2972     // Loop runs once per position in the test text, until a break position
2973     //  is found.
2974     for (;;) {
2975         prevPosX2 = prevPos;
2976         prevCharX2 = prevChar;
2977
2978         prevPos   = pos;
2979         prevChar  = thisChar;
2980
2981         pos       = nextPos;
2982         thisChar  = fText->char32At(pos);
2983
2984         nextCPPos = fText->moveIndex32(pos, 1);
2985         nextPos   = nextCPPos;
2986
2987
2988         if (pos >= fText->length()) {
2989             setAppliedRule(pos, "LB2 - Break at end of text.");
2990             break;
2991         }
2992
2993
2994         //             We do this one out-of-order because the adjustment does not change anything
2995         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2996         //             be applied.
2997         rule9Adjust(prevPos, &prevChar, &pos, &thisChar);
2998         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2999         c = fText->char32At(nextPos);
3000         rule9Adjust(pos, &thisChar, &nextPos, &c);
3001
3002         // If the loop is still warming up - if we haven't shifted the initial
3003         //   -1 positions out of prevPos yet - loop back to advance the
3004         //    position in the input without any further looking for breaks.
3005         if (prevPos == -1) {
3006           setAppliedRule(pos, "LB 9 - adjust for combining sequences.");
3007             continue;
3008         }
3009
3010
3011         if (fBK->contains(prevChar)) {
3012             setAppliedRule(pos, "LB 4  Always break after hard line breaks");
3013             break;
3014         }
3015
3016
3017         if (prevChar == 0x0d && thisChar == 0x0a) {
3018             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3019             continue;
3020         }
3021         if (prevChar == 0x0d ||
3022             prevChar == 0x0a ||
3023             prevChar == 0x85)  {
3024             setAppliedRule(pos, "LB 5  Break after CR, LF, NL, but not inside CR LF");
3025             break;
3026         }
3027
3028
3029         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3030             fBK->contains(thisChar)) {
3031             setAppliedRule(pos, "LB 6  Don't break before hard line breaks");
3032             continue;
3033         }
3034
3035
3036         if (fSP->contains(thisChar)) {
3037             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3038             continue;
3039         }
3040
3041         // !!! ??? Is this the right text for the applied rule?
3042         if (fZW->contains(thisChar)) {
3043             setAppliedRule(pos, "LB 7  Don't break before spaces or zero-width space.");
3044             continue;
3045         }
3046
3047
3048         //       ZW SP* ÷
3049         //       Scan backwards from prevChar for SP* ZW
3050         tPos = prevPos;
3051         while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3052             tPos = fText->moveIndex32(tPos, -1);
3053         }
3054         if (fZW->contains(fText->char32At(tPos))) {
3055             setAppliedRule(pos, "LB 8  Break after zero width space");
3056             break;
3057         }
3058
3059
3060         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
3061         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
3062         if (fNumberMatcher->lookingAt(prevPos, status)) {
3063             if (U_FAILURE(status)) {
3064                 setAppliedRule(pos, "LB 25 Numbers");
3065                 break;
3066             }
3067             // Matched a number.  But could have been just a single digit, which would
3068             //    not represent a "no break here" between prevChar and thisChar
3069             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3070             if (numEndIdx > pos) {
3071                 // Number match includes at least our two chars being checked
3072                 if (numEndIdx > nextPos) {
3073                     // Number match includes additional chars.  Update pos and nextPos
3074                     //   so that next loop iteration will continue at the end of the number,
3075                     //   checking for breaks between last char in number & whatever follows.
3076                     pos = nextPos = numEndIdx;
3077                     do {
3078                         pos = fText->moveIndex32(pos, -1);
3079                         thisChar = fText->char32At(pos);
3080                     } while (fCM->contains(thisChar));
3081                 }
3082                 setAppliedRule(pos, "LB 25 Numbers");
3083                 continue;
3084             }
3085         }
3086
3087
3088         //       The monkey test's way of ignoring combining characters doesn't work
3089         //       for this rule. ZJ is also a CM. Need to get the actual character
3090         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3091         {
3092             int32_t prevIdx = fText->moveIndex32(pos, -1);
3093             UChar32 prevC = fText->char32At(prevIdx);
3094             if (fZWJ->contains(prevC)) {
3095                 setAppliedRule(pos, "LB 8a ZWJ x");
3096                 continue;
3097             }
3098         }
3099
3100
3101         // appliedRule: "LB 9, 10"; //  Already done, at top of loop.";
3102         //
3103
3104
3105         //    x  WJ
3106         //    WJ  x
3107         //
3108         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3109             setAppliedRule(pos, "LB 11  Do not break before or after WORD JOINER and related characters.");
3110             continue;
3111         }
3112
3113
3114         if (fGL->contains(prevChar)) {
3115             setAppliedRule(pos, "LB 12  GL  x");
3116             continue;
3117         }
3118
3119
3120           if (!(fSP->contains(prevChar) ||
3121               fBA->contains(prevChar) ||
3122               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3123               setAppliedRule(pos, "LB 12a  [^SP BA HY] x GL");
3124               continue;
3125         }
3126
3127
3128         if (fCL->contains(thisChar) ||
3129                 fCP->contains(thisChar) ||
3130                 fEX->contains(thisChar) ||
3131                 fSY->contains(thisChar)) {
3132             setAppliedRule(pos, "LB 13  Don't break before closings.");
3133             continue;
3134         }
3135
3136
3137         //       Scan backwards, checking for this sequence.
3138         //       The OP char could include combining marks, so we actually check for
3139         //           OP CM* SP*
3140         //       Another Twist: The Rule 9 fixes may have changed a SP CM
3141         //       sequence into a ID char, so before scanning back through spaces,
3142         //       verify that prevChar is indeed a space.  The prevChar variable
3143         //       may differ from fText[prevPos]
3144         tPos = prevPos;
3145         if (fSP->contains(prevChar)) {
3146             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3147                 tPos=fText->moveIndex32(tPos, -1);
3148             }
3149         }
3150         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3151             tPos=fText->moveIndex32(tPos, -1);
3152         }
3153         if (fOP->contains(fText->char32At(tPos))) {
3154             setAppliedRule(pos, "LB 14 Don't break after OP SP*");
3155             continue;
3156         }
3157
3158
3159         if (nextPos < fText->length()) {
3160             // note: UnicodeString::char32At(length) returns ffff, not distinguishable
3161             //       from a legit ffff character. So test length separately.
3162             UChar32 nextChar = fText->char32At(nextPos);
3163             if (fSP->contains(prevChar) && fIS->contains(thisChar) && fNU->contains(nextChar)) {
3164                 setAppliedRule(pos, "LB 14a Break before an IS that begins a number and follows a space");
3165                 break;
3166             }
3167         }
3168
3169
3170           if (fIS->contains(thisChar)) {
3171               setAppliedRule(pos, "LB 14b  Do not break before numeric separators, even after spaces.");
3172               continue;
3173         }
3174
3175
3176         if (fOP->contains(thisChar)) {
3177             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3178             int tPos = prevPos;
3179             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3180                 tPos = fText->moveIndex32(tPos, -1);
3181             }
3182             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3183                 tPos = fText->moveIndex32(tPos, -1);
3184             }
3185             if (fQU->contains(fText->char32At(tPos))) {
3186                 setAppliedRule(pos, "LB 15    QU SP* x OP");
3187                 continue;
3188             }
3189         }
3190
3191
3192         //    Scan backwards for SP* CM* (CL | CP)
3193         if (fNS->contains(thisChar)) {
3194             int tPos = prevPos;
3195             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3196                 tPos = fText->moveIndex32(tPos, -1);
3197             }
3198             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3199                 tPos = fText->moveIndex32(tPos, -1);
3200             }
3201             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3202                 setAppliedRule(pos, "LB 16   (CL | CP) SP* x NS");
3203                 continue;
3204             }
3205         }
3206
3207
3208         if (fB2->contains(thisChar)) {
3209             //  Scan backwards, checking for the B2 CM* SP* sequence.
3210             tPos = prevPos;
3211             if (fSP->contains(prevChar)) {
3212                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3213                     tPos=fText->moveIndex32(tPos, -1);
3214                 }
3215             }
3216             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3217                 tPos=fText->moveIndex32(tPos, -1);
3218             }
3219             if (fB2->contains(fText->char32At(tPos))) {
3220                 setAppliedRule(pos, "LB 17   B2 SP* x B2");
3221                 continue;
3222             }
3223         }
3224
3225
3226         if (fSP->contains(prevChar)) {
3227             setAppliedRule(pos, "LB 18    break after space");
3228             break;
3229         }
3230
3231         //    x   QU
3232         //    QU  x
3233         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3234             setAppliedRule(pos, "LB 19");
3235             continue;
3236         }
3237
3238         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3239             setAppliedRule(pos, "LB 20  Break around a CB");
3240             break;
3241         }
3242
3243         //           Don't break between Hyphens and letters if a break precedes the hyphen.
3244         //           Formerly this was a Finnish tailoring.
3245         //           Moved to root in ICU 63. This is an ICU customization, not in UAX-14.
3246         //           ^($HY | $HH) $AL;
3247         if (fAL->contains(thisChar) && (fHY->contains(prevChar) || fHH->contains(prevChar)) &&
3248                 prevPosX2 == -1) {
3249             setAppliedRule(pos, "LB 20.09");
3250             continue;
3251         }
3252
3253         if (fBA->contains(thisChar) ||
3254             fHY->contains(thisChar) ||
3255             fNS->contains(thisChar) ||
3256             fBB->contains(prevChar) )   {
3257             setAppliedRule(pos, "LB 21");
3258             continue;
3259         }
3260
3261         if (fHL->contains(prevCharX2) &&
3262                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3263             setAppliedRule(pos, "LB 21a   HL (HY | BA) x");
3264             continue;
3265         }
3266
3267         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3268             setAppliedRule(pos, "LB 21b SY x HL");
3269             continue;
3270         }
3271
3272         if (fIN->contains(thisChar))   {
3273             setAppliedRule(pos, "LB 22");
3274             continue;
3275         }
3276
3277
3278         //          (AL | HL) x NU
3279         //          NU x (AL | HL)
3280         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3281             setAppliedRule(pos, "LB 23");
3282             continue;
3283         }
3284         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3285             setAppliedRule(pos, "LB 23");
3286             continue;
3287         }
3288
3289         // Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3290         //      PR x (ID | EB | EM)
3291         //     (ID | EB | EM) x PO
3292         if (fPR->contains(prevChar) &&
3293                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3294             setAppliedRule(pos, "LB 23a");
3295             continue;
3296         }
3297         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3298                 fPO->contains(thisChar)) {
3299             setAppliedRule(pos, "LB 23a");
3300             continue;
3301         }
3302
3303         //   Do not break between prefix and letters or ideographs.
3304         //         (PR | PO) x (AL | HL)
3305         //         (AL | HL) x (PR | PO)
3306         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3307                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3308             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3309             continue;
3310         }
3311         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3312                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3313             setAppliedRule(pos, "LB 24 no break between prefix and letters or ideographs");
3314             continue;
3315         }
3316
3317         // appliedRule: "LB 25 numbers match"; // moved up, before LB 8a,
3318
3319         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3320                                         fJV->contains(thisChar) ||
3321                                         fH2->contains(thisChar) ||
3322                                         fH3->contains(thisChar))) {
3323             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3324             continue;
3325                                         }
3326
3327         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3328             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3329             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3330             continue;
3331         }
3332
3333         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3334             fJT->contains(thisChar)) {
3335             setAppliedRule(pos, "LB 26 Do not break a Korean syllable.");
3336             continue;
3337         }
3338
3339         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3340             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3341             fIN->contains(thisChar)) {
3342             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3343             continue;
3344             }
3345         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3346             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3347             fPO->contains(thisChar)) {
3348             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3349             continue;
3350             }
3351         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3352             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3353             setAppliedRule(pos, "LB 27 Treat a Korean Syllable Block the same as ID.");
3354             continue;
3355             }
3356
3357
3358
3359         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3360             setAppliedRule(pos, "LB 28  Do not break between alphabetics (\"at\").");
3361             continue;
3362         }
3363
3364           if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3365               setAppliedRule(pos, "LB 29  Do not break between numeric punctuation and alphabetics (\"e.g.\").");
3366               continue;
3367         }
3368
3369         //          (AL | NU) x OP
3370         //          CP x (AL | NU)
3371         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP30->contains(thisChar)) {
3372             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3373             continue;
3374         }
3375         if (fCP30->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3376             setAppliedRule(pos,  "LB 30 No break in letters, numbers, or ordinary symbols, opening/closing punctuation.");
3377             continue;
3378         }
3379
3380         //             RI  x  RI
3381         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3382             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3383             break;
3384         }
3385         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3386             // Two Regional Indicators have been paired.
3387             // Over-write the trailing one (thisChar) to prevent it from forming another pair with a
3388             // following RI. This is a hack.
3389             thisChar = -1;
3390             setAppliedRule(pos, "LB30a    RI RI  ÷  RI");
3391             continue;
3392         }
3393
3394         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3395             setAppliedRule(pos, "LB30b    Emoji Base x Emoji Modifier");
3396             continue;
3397         }
3398
3399         setAppliedRule(pos, "LB 31    Break everywhere else");
3400         break;
3401     }
3402
3403     return pos;
3404 }
3405
3406
3407 UVector  *RBBILineMonkey::charClasses() {
3408     return fSets;
3409 }
3410
3411
3412 RBBILineMonkey::~RBBILineMonkey() {
3413     delete fSets;
3414
3415     delete fBK;
3416     delete fCR;
3417     delete fLF;
3418     delete fCM;
3419     delete fNL;
3420     delete fWJ;
3421     delete fZW;
3422     delete fGL;
3423     delete fCB;
3424     delete fSP;
3425     delete fB2;
3426     delete fBA;
3427     delete fBB;
3428     delete fHH;
3429     delete fHY;
3430     delete fH2;
3431     delete fH3;
3432     delete fCL;
3433     delete fCP;
3434     delete fEX;
3435     delete fIN;
3436     delete fJL;
3437     delete fJV;
3438     delete fJT;
3439     delete fNS;
3440     delete fOP;
3441     delete fQU;
3442     delete fIS;
3443     delete fNU;
3444     delete fPO;
3445     delete fPR;
3446     delete fSY;
3447     delete fAI;
3448     delete fAL;
3449     delete fCJ;
3450     delete fHL;
3451     delete fID;
3452     delete fRI;
3453     delete fSG;
3454     delete fXX;
3455     delete fEB;
3456     delete fEM;
3457     delete fZWJ;
3458     delete fOP30;
3459     delete fCP30;
3460
3461     delete fCharBI;
3462     delete fNumberMatcher;
3463 }
3464
3465
3466 //-------------------------------------------------------------------------------------------
3467 //
3468 //   TestMonkey
3469 //
3470 //     params
3471 //       seed=nnnnn        Random number starting seed.
3472 //                         Setting the seed allows errors to be reproduced.
3473 //       loop=nnn          Looping count.  Controls running time.
3474 //                         -1:  run forever.
3475 //                          0 or greater:  run length.
3476 //
3477 //       type = char | word | line | sent | title
3478 //
3479 //  Example:
3480 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3481 //
3482 //-------------------------------------------------------------------------------------------
3483
3484 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3485     int32_t val = defaultVal;
3486     name.append(" *= *(-?\\d+)");
3487     UErrorCode status = U_ZERO_ERROR;
3488     RegexMatcher m(name, params, 0, status);
3489     if (m.find()) {
3490         // The param exists.  Convert the string to an int.
3491         char valString[100];
3492         int32_t paramLength = m.end(1, status) - m.start(1, status);
3493         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3494             paramLength = (int32_t)(sizeof(valString)-2);
3495         }
3496         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3497         val = strtol(valString, NULL, 10);
3498
3499         // Delete this parameter from the params string.
3500         m.reset();
3501         params = m.replaceFirst("", status);
3502     }
3503     U_ASSERT(U_SUCCESS(status));
3504     return val;
3505 }
3506 #endif
3507
3508 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3509 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3510                                     BreakIterator *bi,
3511                                     int expected[],
3512                                     int expectedcount)
3513 {
3514     int count = 0;
3515     int i = 0;
3516     int forward[50];
3517     bi->setText(ustr);
3518     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3519         forward[count] = i;
3520         if (count < expectedcount && expected[count] != i) {
3521             test->errln("%s:%d break forward test failed: expected %d but got %d",
3522                         __FILE__, __LINE__, expected[count], i);
3523             break;
3524         }
3525         count ++;
3526     }
3527     if (count != expectedcount) {
3528         printStringBreaks(ustr, expected, expectedcount);
3529         test->errln("%s:%d break forward test failed: missed %d match",
3530                     __FILE__, __LINE__, expectedcount - count);
3531         return;
3532     }
3533     // testing boundaries
3534     for (i = 1; i < expectedcount; i ++) {
3535         int j = expected[i - 1];
3536         if (!bi->isBoundary(j)) {
3537             printStringBreaks(ustr, expected, expectedcount);
3538             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3539                     __FILE__, __LINE__, j);
3540             return;
3541         }
3542         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3543             if (bi->isBoundary(j)) {
3544                 printStringBreaks(ustr, expected, expectedcount);
3545                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3546                     __FILE__, __LINE__, j);
3547                 return;
3548             }
3549         }
3550     }
3551
3552     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3553         count --;
3554         if (forward[count] != i) {
3555             printStringBreaks(ustr, expected, expectedcount);
3556             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3557                         __FILE__, __LINE__, forward[count], i);
3558             break;
3559         }
3560     }
3561     if (count != 0) {
3562         printStringBreaks(ustr, expected, expectedcount);
3563         test->errln("break test previous() failed: missed a match");
3564         return;
3565     }
3566
3567     // testing preceding
3568     for (i = 0; i < expectedcount - 1; i ++) {
3569         // int j = expected[i] + 1;
3570         int j = ustr.moveIndex32(expected[i], 1);
3571         for (; j <= expected[i + 1]; j ++) {
3572             int32_t expectedPreceding = expected[i];
3573             int32_t actualPreceding = bi->preceding(j);
3574             if (actualPreceding != expectedPreceding) {
3575                 printStringBreaks(ustr, expected, expectedcount);
3576                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3577                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3578                 return;
3579             }
3580         }
3581     }
3582 }
3583 #endif
3584
3585 void RBBITest::TestWordBreaks(void)
3586 {
3587 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3588
3589     Locale        locale("en");
3590     UErrorCode    status = U_ZERO_ERROR;
3591     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3592     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3593     // Replaced any C+J characters in a row with a random sequence of characters
3594     // of the same length to make our C+J segmentation not get in the way.
3595     static const char *strlist[] =
3596     {
3597     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3598     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3599     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3600     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3601     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3602     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3603     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3604     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3605     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3606     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3607     "\\u2027\\U000e0067\\u0a47\\u00b7",
3608     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3609     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3610     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3611     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3612     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3613     "\\u0027\\u11af\\U000e0057\\u0602",
3614     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3615     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3616     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3617     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3618     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3619     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3620     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3621     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3622     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3623     "\\u18f4\\U000e0049\\u20e7\\u2027",
3624     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3625     "\\ua183\\u102d\\u0bec\\u003a",
3626     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3627     "\\u003a\\u0e57\\u0fad\\u002e",
3628     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3629     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3630     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3631     "\\u003a\\u0664\\u00b7\\u1fba",
3632     "\\u003b\\u0027\\u00b7\\u47a3",
3633     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3634     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3635     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3636     };
3637     int loop;
3638     if (U_FAILURE(status)) {
3639         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3640         return;
3641     }
3642     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3643         // printf("looping %d\n", loop);
3644         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3645         // RBBICharMonkey monkey;
3646         RBBIWordMonkey monkey;
3647
3648         int expected[50];
3649         int expectedcount = 0;
3650
3651         monkey.setText(ustr);
3652         int i;
3653         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3654             expected[expectedcount ++] = i;
3655         }
3656
3657         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3658     }
3659     delete bi;
3660 #endif
3661 }
3662
3663 void RBBITest::TestWordBoundary(void)
3664 {
3665     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3666     Locale        locale("en");
3667     UErrorCode    status = U_ZERO_ERROR;
3668     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3669     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3670     if (U_FAILURE(status)) {
3671         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3672                 __FILE__, __LINE__, u_errorName(status));
3673         return;
3674     }
3675     UChar         str[50];
3676     static const char *strlist[] =
3677     {
3678     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3679     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3680     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3681     "\\u2027\\U000e0067\\u0a47\\u00b7",
3682     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3683     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3684     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3685     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3686     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3687     "\\u0027\\u11af\\U000e0057\\u0602",
3688     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3689     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3690     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3691     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3692     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3693     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3694     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3695     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3696     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3697     "\\u58f4\\U000e0049\\u20e7\\u2027",
3698     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3699     "\\ua183\\u102d\\u0bec\\u003a",
3700     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3701     "\\u003a\\u0e57\\u0fad\\u002e",
3702     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3703     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3704     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3705     "\\u003a\\u0664\\u00b7\\u1fba",
3706     "\\u003b\\u0027\\u00b7\\u47a3",
3707     };
3708     int loop;
3709     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3710         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3711         UnicodeString ustr(str);
3712         int forward[50];
3713         int count = 0;
3714
3715         bi->setText(ustr);
3716         int prev = -1;
3717         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3718             ++count;
3719             if (count >= UPRV_LENGTHOF(forward)) {
3720                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3721                         __FILE__, __LINE__, loop, count, boundary);
3722                 return;
3723             }
3724             forward[count] = boundary;
3725             if (boundary <= prev) {
3726                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3727                         __FILE__, __LINE__, loop, prev, boundary);
3728                 break;
3729             }
3730             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3731                 if (bi->isBoundary(nonBoundary)) {
3732                     printStringBreaks(ustr, forward, count);
3733                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3734                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3735                     return;
3736                 }
3737             }
3738             if (!bi->isBoundary(boundary)) {
3739                 printStringBreaks(ustr, forward, count);
3740                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3741                        __FILE__, __LINE__, boundary);
3742                 return;
3743             }
3744             prev = boundary;
3745         }
3746     }
3747 }
3748
3749 void RBBITest::TestLineBreaks(void)
3750 {
3751 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3752     Locale        locale("en");
3753     UErrorCode    status = U_ZERO_ERROR;
3754     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3755     const int32_t  STRSIZE = 50;
3756     UChar         str[STRSIZE];
3757     static const char *strlist[] =
3758     {
3759      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3760      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3761              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3762      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3763              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3764      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3765      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3766      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3767      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3768      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3769      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3770      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3771      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3772      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3773      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3774      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3775      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3776      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3777      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3778      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3779      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3780      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3781      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3782      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3783      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3784      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3785      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3786      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3787      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3788      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3789      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3790      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3791      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3792      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3793      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3794      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3795      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3796      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3797      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3798          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3799     };
3800     int loop;
3801     TEST_ASSERT_SUCCESS(status);
3802     if (U_FAILURE(status)) {
3803         return;
3804     }
3805     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3806         // printf("looping %d\n", loop);
3807         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3808         if (t >= STRSIZE) {
3809             TEST_ASSERT(FALSE);
3810             continue;
3811         }
3812
3813
3814         UnicodeString ustr(str);
3815         RBBILineMonkey monkey;
3816         if (U_FAILURE(monkey.deferredStatus)) {
3817             continue;
3818         }
3819
3820         const int EXPECTEDSIZE = 50;
3821         int expected[EXPECTEDSIZE];
3822         int expectedcount = 0;
3823
3824         monkey.setText(ustr);
3825
3826         int i;
3827         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3828             if (expectedcount >= EXPECTEDSIZE) {
3829                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3830                 return;
3831             }
3832             expected[expectedcount ++] = i;
3833         }
3834
3835         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3836     }
3837     delete bi;
3838 #endif
3839 }
3840
3841 void RBBITest::TestSentBreaks(void)
3842 {
3843 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3844     Locale        locale("en");
3845     UErrorCode    status = U_ZERO_ERROR;
3846     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3847     UChar         str[200];
3848     static const char *strlist[] =
3849     {
3850      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3851      "This\n",
3852      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3853      "\"Sentence ending with a quote.\" Bye.",
3854      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3855      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3856      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3857      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3858      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3859      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3860      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3861              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3862              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3863              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3864      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3865              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3866              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3867              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3868              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3869              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3870     };
3871     int loop;
3872     if (U_FAILURE(status)) {
3873         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3874         return;
3875     }
3876     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3877         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3878         UnicodeString ustr(str);
3879
3880         RBBISentMonkey monkey;
3881         if (U_FAILURE(monkey.deferredStatus)) {
3882             continue;
3883         }
3884
3885         const int EXPECTEDSIZE = 50;
3886         int expected[EXPECTEDSIZE];
3887         int expectedcount = 0;
3888
3889         monkey.setText(ustr);
3890
3891         int i;
3892         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3893             if (expectedcount >= EXPECTEDSIZE) {
3894                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3895                 return;
3896             }
3897             expected[expectedcount ++] = i;
3898         }
3899
3900         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3901     }
3902     delete bi;
3903 #endif
3904 }
3905
3906 void RBBITest::TestMonkey() {
3907 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3908
3909     UErrorCode     status    = U_ZERO_ERROR;
3910     int32_t        loopCount = 500;
3911     int32_t        seed      = 1;
3912     UnicodeString  breakType = "all";
3913     Locale         locale("en");
3914     UBool          useUText  = FALSE;
3915
3916     if (quick == FALSE) {
3917         loopCount = 10000;
3918     }
3919
3920     if (fTestParams) {
3921         UnicodeString p(fTestParams);
3922         loopCount = getIntParam("loop", p, loopCount);
3923         seed      = getIntParam("seed", p, seed);
3924
3925         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3926         if (m.find()) {
3927             breakType = m.group(1, status);
3928             m.reset();
3929             p = m.replaceFirst("", status);
3930         }
3931
3932         RegexMatcher u(" *utext", p, 0, status);
3933         if (u.find()) {
3934             useUText = TRUE;
3935             u.reset();
3936             p = u.replaceFirst("", status);
3937         }
3938
3939
3940         // m.reset(p);
3941         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3942             // Each option is stripped out of the option string as it is processed.
3943             // All options have been checked.  The option string should have been completely emptied..
3944             char buf[100];
3945             p.extract(buf, sizeof(buf), NULL, status);
3946             buf[sizeof(buf)-1] = 0;
3947             errln("Unrecognized or extra parameter:  %s\n", buf);
3948             return;
3949         }
3950
3951     }
3952
3953     if (breakType == "char" || breakType == "all") {
3954         RBBICharMonkey  m;
3955         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3956         if (U_SUCCESS(status)) {
3957             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3958             if (breakType == "all" && useUText==FALSE) {
3959                 // Also run a quick test with UText when "all" is specified
3960                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3961             }
3962         }
3963         else {
3964             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3965         }
3966         delete bi;
3967     }
3968
3969     if (breakType == "word" || breakType == "all") {
3970         logln("Word Break Monkey Test");
3971         RBBIWordMonkey  m;
3972         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3973         if (U_SUCCESS(status)) {
3974             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3975         }
3976         else {
3977             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3978         }
3979         delete bi;
3980     }
3981
3982     if (breakType == "line" || breakType == "all") {
3983         logln("Line Break Monkey Test");
3984         RBBILineMonkey  m;
3985         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3986         if (loopCount >= 10) {
3987             loopCount = loopCount / 5;   // Line break runs slower than the others.
3988         }
3989         if (U_SUCCESS(status)) {
3990             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3991         }
3992         else {
3993             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3994         }
3995         delete bi;
3996     }
3997
3998     if (breakType == "sent" || breakType == "all"  ) {
3999         logln("Sentence Break Monkey Test");
4000         RBBISentMonkey  m;
4001         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4002         if (loopCount >= 10) {
4003             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4004         }
4005         if (U_SUCCESS(status)) {
4006             RunMonkey(bi, m, "sent", seed, loopCount, useUText);
4007         }
4008         else {
4009             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4010         }
4011         delete bi;
4012     }
4013
4014 #endif
4015 }
4016
4017 //
4018 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4019 //    Parameters:
4020 //       bi      - the break iterator to use
4021 //       mk      - MonkeyKind, abstraction for obtaining expected results
4022 //       name    - Name of test (char, word, etc.) for use in error messages
4023 //       seed    - Seed for starting random number generator (parameter from user)
4024 //       numIterations
4025 //
4026 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4027                          int32_t numIterations, UBool useUText) {
4028
4029 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4030
4031     const int32_t    TESTSTRINGLEN = 500;
4032     UnicodeString    testText;
4033     int32_t          numCharClasses;
4034     UVector          *chClasses;
4035     int              expectedCount = 0;
4036     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4037     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4038     char             reverseBreaks[TESTSTRINGLEN*2+1];
4039     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4040     char             followingBreaks[TESTSTRINGLEN*2+1];
4041     char             precedingBreaks[TESTSTRINGLEN*2+1];
4042     int              i;
4043     int              loopCount = 0;
4044
4045
4046     m_seed = seed;
4047
4048     numCharClasses = mk.charClasses()->size();
4049     chClasses      = mk.charClasses();
4050
4051     // Check for errors that occured during the construction of the MonkeyKind object.
4052     //  Can't report them where they occured because errln() is a method coming from intlTest,
4053     //  and is not visible outside of RBBITest :-(
4054     if (U_FAILURE(mk.deferredStatus)) {
4055         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4056         return;
4057     }
4058
4059     // Verify that the character classes all have at least one member.
4060     for (i=0; i<numCharClasses; i++) {
4061         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4062         if (s == NULL || s->size() == 0) {
4063             errln("Character Class #%d is null or of zero size.", i);
4064             return;
4065         }
4066     }
4067
4068     // For minimizing width of class name output.
4069     int classNameSize = mk.maxClassNameSize();
4070
4071     while (loopCount < numIterations || numIterations == -1) {
4072         if (numIterations == -1 && loopCount % 10 == 0) {
4073             // If test is running in an infinite loop, display a periodic tic so
4074             //   we can tell that it is making progress.
4075             fprintf(stderr, ".");
4076         }
4077         // Save current random number seed, so that we can recreate the random numbers
4078         //   for this loop iteration in event of an error.
4079         seed = m_seed;
4080
4081         // Populate a test string with data.
4082         testText.truncate(0);
4083         for (i=0; i<TESTSTRINGLEN; i++) {
4084             int32_t  aClassNum = m_rand() % numCharClasses;
4085             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4086             int32_t   charIdx = m_rand() % classSet->size();
4087             UChar32   c = classSet->charAt(charIdx);
4088             if (c < 0) {   // TODO:  deal with sets containing strings.
4089                 errln("%s:%d c < 0", __FILE__, __LINE__);
4090                 break;
4091             }
4092             // Do not assemble a supplementary character from randomly generated separate surrogates.
4093             //   (It could be a dictionary character)
4094             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4095                 continue;
4096             }
4097
4098             testText.append(c);
4099         }
4100
4101         // Calculate the expected results for this test string and reset applied rules.
4102         mk.setText(testText);
4103
4104         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4105         expectedBreaks[0] = 1;
4106         int32_t breakPos = 0;
4107         expectedCount = 0;
4108         for (;;) {
4109             breakPos = mk.next(breakPos);
4110             if (breakPos == -1) {
4111                 break;
4112             }
4113             if (breakPos > testText.length()) {
4114                 errln("breakPos > testText.length()");
4115             }
4116             expectedBreaks[breakPos] = 1;
4117             U_ASSERT(expectedCount<testText.length());
4118         }
4119
4120         // Find the break positions using forward iteration
4121         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4122         if (useUText) {
4123             UErrorCode status = U_ZERO_ERROR;
4124             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4125             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4126             bi->setText(testUText, status);
4127             TEST_ASSERT_SUCCESS(status);
4128             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4129                                       //  This UText can be closed immediately, so long as the
4130                                       //  testText string continues to exist.
4131         } else {
4132             bi->setText(testText);
4133         }
4134
4135         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4136             if (i < 0 || i > testText.length()) {
4137                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4138                 break;
4139             }
4140             forwardBreaks[i] = 1;
4141         }
4142
4143         // Find the break positions using reverse iteration
4144         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4145         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4146             if (i < 0 || i > testText.length()) {
4147                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4148                 break;
4149             }
4150             reverseBreaks[i] = 1;
4151         }
4152
4153         // Find the break positions using isBoundary() tests.
4154         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4155         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4156         for (i=0; i<=testText.length(); i++) {
4157             isBoundaryBreaks[i] = bi->isBoundary(i);
4158         }
4159
4160
4161         // Find the break positions using the following() function.
4162         // printf(".");
4163         memset(followingBreaks, 0, sizeof(followingBreaks));
4164         int32_t   lastBreakPos = 0;
4165         followingBreaks[0] = 1;
4166         for (i=0; i<testText.length(); i++) {
4167             breakPos = bi->following(i);
4168             if (breakPos <= i ||
4169                 breakPos < lastBreakPos ||
4170                 breakPos > testText.length() ||
4171                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4172                 errln("%s break monkey test: "
4173                     "Out of range value returned by BreakIterator::following().\n"
4174                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4175                          name, seed, i, breakPos, lastBreakPos);
4176                 break;
4177             }
4178             followingBreaks[breakPos] = 1;
4179             lastBreakPos = breakPos;
4180         }
4181
4182         // Find the break positions using the preceding() function.
4183         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4184         lastBreakPos = testText.length();
4185         precedingBreaks[testText.length()] = 1;
4186         for (i=testText.length(); i>0; i--) {
4187             breakPos = bi->preceding(i);
4188             if (breakPos >= i ||
4189                 breakPos > lastBreakPos ||
4190                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4191                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4192                 errln("%s break monkey test: "
4193                     "Out of range value returned by BreakIterator::preceding().\n"
4194                     "index=%d;  prev returned %d; lastBreak=%d" ,
4195                     name,  i, breakPos, lastBreakPos);
4196                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4197                     precedingBreaks[i] = 2;   // Forces an error.
4198                 }
4199             } else {
4200                 if (breakPos >= 0) {
4201                     precedingBreaks[breakPos] = 1;
4202                 }
4203                 lastBreakPos = breakPos;
4204             }
4205         }
4206
4207         // Compare the expected and actual results.
4208         for (i=0; i<=testText.length(); i++) {
4209             const char *errorType = NULL;
4210             const char* currentBreakData = NULL;
4211             if  (forwardBreaks[i] != expectedBreaks[i]) {
4212                 errorType = "next()";
4213                 currentBreakData = forwardBreaks;
4214             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4215                 errorType = "previous()";
4216                 currentBreakData = reverseBreaks;
4217            } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4218                 errorType = "isBoundary()";
4219                 currentBreakData = isBoundaryBreaks;
4220             } else if (followingBreaks[i] != expectedBreaks[i]) {
4221                 errorType = "following()";
4222                 currentBreakData = followingBreaks;
4223             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4224                 errorType = "preceding()";
4225                 currentBreakData = precedingBreaks;
4226             }
4227
4228             if (errorType != NULL) {
4229                 // Format a range of the test text that includes the failure as
4230                 //  a data item that can be included in the rbbi test data file.
4231
4232                 // Start of the range is the last point where expected and actual results
4233                 //  both agreed that there was a break position.
4234
4235                 int startContext = i;
4236                 int32_t count = 0;
4237                 for (;;) {
4238                     if (startContext==0) { break; }
4239                     startContext --;
4240                     if (expectedBreaks[startContext] != 0) {
4241                         if (count == 2) break;
4242                         count ++;
4243                     }
4244                 }
4245
4246                 // End of range is two expected breaks past the start position.
4247                 int endContext = i + 1;
4248                 int ci;
4249                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4250                     for (;;) {
4251                         if (endContext >= testText.length()) {break;}
4252                         if (expectedBreaks[endContext-1] != 0) {
4253                             if (count == 0) break;
4254                             count --;
4255                         }
4256                         endContext ++;
4257                     }
4258                 }
4259
4260                 // Formatting of each line includes:
4261                 //   character code
4262                 //   reference break: '|' -> a break, '.' -> no break
4263                 //   actual break:    '|' -> a break, '.' -> no break
4264                 //   (name of character clase)
4265                 //   Unicode name of character
4266                 //   '-->' indicates location of the difference.
4267
4268                 MONKEY_ERROR(
4269                     (expectedBreaks[i] ? "Break expected but not found" :
4270                        "Break found but not expected"),
4271                     name, i, seed);
4272
4273                 for (ci=startContext; (ci = testText.moveIndex32(ci, 1));) {
4274                     UChar32  c;
4275                     c = testText.char32At(ci);
4276
4277                     std::string currentLineFlag = "   ";
4278                     if (ci == i) {
4279                         currentLineFlag = "-->";  // Error position
4280                     }
4281
4282                     // BMP or SMP character in hex
4283                     char hexCodePoint[12];
4284                     std::string format = "    \\u%04x";
4285                     if (c >= 0x10000) {
4286                         format = "\\U%08x";
4287                     }
4288                     sprintf(hexCodePoint, format.c_str(), c);
4289
4290                     // Get the class name and character name for the character.
4291                     char cName[200];
4292                     UErrorCode status = U_ZERO_ERROR;
4293                     u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
4294
4295                     char buffer[200];
4296                     snprintf(buffer, 200,
4297                              "%4s %3i :  %1s  %1s  %10s  %-*s  %-40s  %-40s",
4298                              currentLineFlag.c_str(),
4299                              ci,
4300                              expectedBreaks[ci] == 0 ? "." : "|",  // Reference break
4301                              currentBreakData[ci] == 0 ? "." : "|",  // Actual break
4302                              hexCodePoint,
4303                              classNameSize,
4304                              mk.classNameFromCodepoint(c).c_str(),
4305                              mk.getAppliedRule(ci).c_str(), cName);
4306
4307                     // Output the error
4308                     if (ci == i) {
4309                         errln(buffer);
4310                     } else {
4311                         infoln(buffer);
4312                     }
4313
4314                     if (ci >= endContext) { break; }
4315                 }
4316                 break;
4317             }
4318         }
4319
4320         loopCount++;
4321     }
4322 #endif
4323 }
4324
4325
4326 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4327 //             This test checks the initial patch,
4328 //             which is to just keep it from crashing.  Correct word boundaries
4329 //             await a proper fix to the dictionary code.
4330 //
4331 void RBBITest::TestBug5532(void)  {
4332    // Text includes a mixture of Thai and Latin.
4333    const unsigned char utf8Data[] = {
4334            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4335            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4336            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4337            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4338            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4339            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4340            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4341            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4342            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4343            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4344            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4345
4346     UErrorCode status = U_ZERO_ERROR;
4347     UText utext=UTEXT_INITIALIZER;
4348     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4349     TEST_ASSERT_SUCCESS(status);
4350
4351     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4352     TEST_ASSERT_SUCCESS(status);
4353     if (U_SUCCESS(status)) {
4354         bi->setText(&utext, status);
4355         TEST_ASSERT_SUCCESS(status);
4356
4357         int32_t breakCount = 0;
4358         int32_t previousBreak = -1;
4359         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4360             // For now, just make sure that the break iterator doesn't hang.
4361             TEST_ASSERT(previousBreak < bi->current());
4362             previousBreak = bi->current();
4363         }
4364         TEST_ASSERT(breakCount > 0);
4365     }
4366     delete bi;
4367     utext_close(&utext);
4368 }
4369
4370
4371 void RBBITest::TestBug9983(void)  {
4372     UnicodeString text = UnicodeString("\\u002A"  // * Other
4373                                        "\\uFF65"  //   Other
4374                                        "\\u309C"  //   Katakana
4375                                        "\\uFF9F"  //   Extend
4376                                        "\\uFF65"  //   Other
4377                                        "\\u0020"  //   Other
4378                                        "\\u0000").unescape();
4379
4380     UErrorCode status = U_ZERO_ERROR;
4381     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4382         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4383     TEST_ASSERT_SUCCESS(status);
4384     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4385         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4386     TEST_ASSERT_SUCCESS(status);
4387     if (U_FAILURE(status)) {
4388         return;
4389     }
4390     int32_t offset, rstatus, iterationCount;
4391
4392     brkiter->setText(text);
4393     brkiter->last();
4394     iterationCount = 0;
4395     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4396         iterationCount++;
4397         rstatus = brkiter->getRuleStatus();
4398         (void)rstatus;     // Suppress set but not used warning.
4399         if (iterationCount >= 10) {
4400            break;
4401         }
4402     }
4403     TEST_ASSERT(iterationCount == 6);
4404
4405     brkiterPOSIX->setText(text);
4406     brkiterPOSIX->last();
4407     iterationCount = 0;
4408     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4409         iterationCount++;
4410         rstatus = brkiterPOSIX->getRuleStatus();
4411         (void)rstatus;     // Suppress set but not used warning.
4412         if (iterationCount >= 10) {
4413            break;
4414         }
4415     }
4416     TEST_ASSERT(iterationCount == 6);
4417 }
4418
4419 // Bug 7547 - verify that building a break itereator from empty rules produces an error.
4420 //
4421 void RBBITest::TestBug7547() {
4422     UnicodeString rules;
4423     UErrorCode status = U_ZERO_ERROR;
4424     UParseError parseError;
4425     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4426     if (status != U_BRK_RULE_SYNTAX) {
4427         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4428     }
4429     if (parseError.line != 1 || parseError.offset != 0) {
4430         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4431     }
4432 }
4433
4434
4435 void RBBITest::TestBug12797() {
4436     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4437     UErrorCode status = U_ZERO_ERROR;
4438     UParseError parseError;
4439     RuleBasedBreakIterator bi(rules, parseError, status);
4440     if (U_FAILURE(status)) {
4441         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4442         return;
4443     }
4444     UnicodeString text = "abc";
4445     bi.setText(text);
4446     bi.first();
4447     int32_t boundary = bi.next();
4448     if (boundary != 3) {
4449         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4450     }
4451 }
4452
4453 void RBBITest::TestBug12918() {
4454     // This test triggers an assertion failure in dictbe.cpp
4455     const UChar *crasherString = u"\u3325\u4a16";
4456     UErrorCode status = U_ZERO_ERROR;
4457     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4458     if (U_FAILURE(status)) {
4459         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4460         return;
4461     }
4462     ubrk_first(iter);
4463     int32_t pos = 0;
4464     int32_t lastPos = -1;
4465     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4466         if (pos <= lastPos) {
4467             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4468             break;
4469         }
4470     }
4471     ubrk_close(iter);
4472 }
4473
4474 void RBBITest::TestBug12932() {
4475     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4476     UnicodeString ruleStr(
4477             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4478             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4479             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4480             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4481             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4482             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4483
4484     UErrorCode status = U_ZERO_ERROR;
4485     UParseError parseError;
4486     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4487     if (status != U_BRK_RULE_SYNTAX) {
4488         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4489                 __FILE__, __LINE__, u_errorName(status));
4490     }
4491 }
4492
4493
// Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
//             remain undivided by ICU char, word and line break.
4496 void RBBITest::TestEmoji() {
4497 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4498     UErrorCode  status = U_ZERO_ERROR;
4499
4500     CharString testFileName;
4501     testFileName.append(IntlTest::getSourceTestData(status), status);
4502     testFileName.appendPathPart("emoji-test.txt", status);
4503     if (U_FAILURE(status)) {
4504         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4505         return;
4506     }
4507     logln("Opening data file %s\n", testFileName.data());
4508
4509     int    len;
4510     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4511     if (U_FAILURE(status) || testFile == NULL) {
4512         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4513         return;
4514     }
4515     UnicodeString testFileAsString(testFile, len);
4516     delete [] testFile;
4517
4518     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4519     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4520     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4521     int32_t lineNumber = 0;
4522
4523     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4524     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4525     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4526     if (U_FAILURE(status)) {
4527         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4528         return;
4529     }
4530
4531     while (lineMatcher.find()) {
4532         ++lineNumber;
4533         UnicodeString line = lineMatcher.group(status);
4534         hexMatcher.reset(line);
4535         UnicodeString testString;   // accumulates the emoji sequence.
4536         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4537             UnicodeString hex = hexMatcher.group(1, status);
4538             if (hex.length() > 8) {
4539                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4540                 break;
4541             }
4542             CharString hex8;
4543             hex8.appendInvariantChars(hex, status);
4544             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4545             if (c<=0x10ffff) {
4546                 testString.append(c);
4547             } else {
4548                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4549                         __FILE__, __LINE__, lineNumber, hex8.data());
4550                 break;
4551             }
4552         }
4553
4554         if (testString.length() > 1) {
4555             charBreaks->setText(testString);
4556             charBreaks->first();
4557             int32_t firstBreak = charBreaks->next();
4558             if (testString.length() != firstBreak) {
4559                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4560                         __FILE__, __LINE__, lineNumber, firstBreak);
4561             }
4562             wordBreaks->setText(testString);
4563             wordBreaks->first();
4564             firstBreak = wordBreaks->next();
4565             if (testString.length() != firstBreak) {
4566                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4567                         __FILE__, __LINE__, lineNumber, firstBreak);
4568             }
4569             lineBreaks->setText(testString);
4570             lineBreaks->first();
4571             firstBreak = lineBreaks->next();
4572             if (testString.length() != firstBreak) {
4573                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4574                         __FILE__, __LINE__, lineNumber, firstBreak);
4575             }
4576         }
4577     }
4578 #endif
4579 }
4580
4581
4582 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4583
4584 void RBBITest::TestBug12519() {
4585     UErrorCode status = U_ZERO_ERROR;
4586     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4587     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4588     if (!assertSuccess(WHERE, status)) {
4589         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4590         return;
4591     }
4592     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4593
4594     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4595     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4596
4597     LocalPointer<RuleBasedBreakIterator>cloneEn(biEn->clone());
4598     assertTrue(WHERE, *biEn == *cloneEn);
4599     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4600
4601     LocalPointer<RuleBasedBreakIterator>cloneFr(biFr->clone());
4602     assertTrue(WHERE, *biFr == *cloneFr);
4603     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4604
4605     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4606     UnicodeString text("Hallo Welt");
4607     biDe->setText(text);
4608     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4609     *biDe = *biFr;
4610     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4611 }
4612
4613 void RBBITest::TestBug12677() {
4614     // Check that stripping of comments from rules for getRules() is not confused by
4615     // the presence of '#' characters in the rules that do not introduce comments.
4616     UnicodeString rules(u"!!forward; \n"
4617                          "$x = [ab#];  # a set with a # literal. \n"
4618                          " # .;        # a comment that looks sort of like a rule.   \n"
4619                          " '#' '?';    # a rule with a quoted #   \n"
4620                        );
4621
4622     UErrorCode status = U_ZERO_ERROR;
4623     UParseError pe;
4624     RuleBasedBreakIterator bi(rules, pe, status);
4625     assertSuccess(WHERE, status);
4626     UnicodeString rtRules = bi.getRules();
4627     assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
4628 }
4629
4630
4631 void RBBITest::TestTableRedundancies() {
4632     UErrorCode status = U_ZERO_ERROR;
4633
4634     LocalPointer<RuleBasedBreakIterator> bi (
4635         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4636     assertSuccess(WHERE, status);
4637     if (U_FAILURE(status)) return;
4638
4639     RBBIDataWrapper *dw = bi->fData;
4640     const RBBIStateTable *fwtbl = dw->fForwardTable;
4641     int32_t numCharClasses = dw->fHeader->fCatCount;
4642     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4643
4644     // Check for duplicate columns (character categories)
4645
4646     std::vector<UnicodeString> columns;
4647     for (int32_t column = 0; column < numCharClasses; column++) {
4648         UnicodeString s;
4649         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4650             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4651             s.append(row->fNextState[column]);
4652         }
4653         columns.push_back(s);
4654     }
4655     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4656     for (int c1=1; c1<numCharClasses; c1++) {
4657         for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4658             if (columns.at(c1) == columns.at(c2)) {
4659                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4660                 goto out;
4661             }
4662         }
4663     }
4664   out:
4665
4666     // Check for duplicate states
4667     std::vector<UnicodeString> rows;
4668     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4669         UnicodeString s;
4670         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4671         assertTrue(WHERE, row->fAccepting >= -1);
4672         s.append(row->fAccepting + 1);   // values of -1 are expected.
4673         s.append(row->fLookAhead);
4674         s.append(row->fTagIdx);
4675         for (int32_t column = 0; column < numCharClasses; column++) {
4676             s.append(row->fNextState[column]);
4677         }
4678         rows.push_back(s);
4679     }
4680     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4681         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4682             if (rows.at(r1) == rows.at(r2)) {
4683                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4684                 return;
4685             }
4686         }
4687     }
4688 }
4689
4690 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4691 //            even after next() has returned DONE.
4692
4693 void RBBITest::TestBug13447() {
4694     UErrorCode status = U_ZERO_ERROR;
4695     LocalPointer<RuleBasedBreakIterator> bi(
4696         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4697     assertSuccess(WHERE, status);
4698     if (U_FAILURE(status)) return;
4699     UnicodeString data(u"1234");
4700     bi->setText(data);
4701     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4702     assertEquals(WHERE, 4, bi->next());
4703     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4704     assertEquals(WHERE, UBRK_DONE, bi->next());
4705     assertEquals(WHERE, 4, bi->current());
4706     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4707 }
4708
4709 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4710 //  for filling the break iterator cache when starting from random positions
4711 //  in the text.
4712 //
4713 //  It's a monkey test, working on random data, with the expected data obtained
4714 //  from forward iteration (no safe rules involved), comparing with results
4715 //  when indexing into the interior of the string (safe rules needed).
4716
4717 void RBBITest::TestReverse() {
4718     UErrorCode status = U_ZERO_ERROR;
4719
4720     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4721             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4722     assertSuccess(WHERE, status, true);
4723     status = U_ZERO_ERROR;
4724     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4725             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4726     assertSuccess(WHERE, status, true);
4727     status = U_ZERO_ERROR;
4728     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4729             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4730     assertSuccess(WHERE, status, true);
4731     status = U_ZERO_ERROR;
4732     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4733             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4734     assertSuccess(WHERE, status, true);
4735 }
4736
4737 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4738     if (!bi) {
4739         return;
4740     }
4741
4742     // From the mapping trie in the break iterator's internal data, create a
4743     // vector of UnicodeStrings, one for each character category, containing
4744     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4745     // to avoid an execess of unassigned code points.
4746
4747     RBBIDataWrapper *data = bi->fData;
4748     int32_t categoryCount = data->fHeader->fCatCount;
4749     UTrie2  *trie = data->fTrie;
4750
4751     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4752     for (int cp=0; cp<0x1fff0; ++cp) {
4753         int cat = utrie2_get32(trie, cp);
4754         cat &= ~0x4000;    // And off the dictionary bit from the category.
4755         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4756         if (cat < 0 || cat >= categoryCount) return;
4757         strings[cat].append(cp);
4758     }
4759
4760     icu_rand randomGen;
4761     const int testStringLength = 10000;
4762     UnicodeString testString;
4763
4764     for (int i=0; i<testStringLength; ++i) {
4765         int charClass = randomGen() % categoryCount;
4766         if (strings[charClass].length() > 0) {
4767             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4768             testString.append(cp);
4769         }
4770     }
4771
4772     typedef std::pair<UBool, int32_t> Result;
4773     std::vector<Result> expectedResults;
4774     bi->setText(testString);
4775     for (int i=0; i<testString.length(); ++i) {
4776         bool isboundary = bi->isBoundary(i);
4777         int  ruleStatus = bi->getRuleStatus();
4778         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4779     }
4780
4781     for (int i=testString.length()-1; i>=0; --i) {
4782         bi->setText(testString);   // clears the internal break cache
4783         Result expected = expectedResults[i];
4784         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4785         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4786     }
4787 }
4788
4789
// Ticket 13692 - finding word boundaries in very large numbers or words could
//                be very time consuming. When the problem was present, this test
//                would run for more than fifteen minutes, which is to say, the failure was noticeable.
4793
4794 void RBBITest::TestBug13692() {
4795     UErrorCode status = U_ZERO_ERROR;
4796     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4797             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4798     if (!assertSuccess(WHERE, status, true)) {
4799         return;
4800     }
4801     constexpr int32_t LENGTH = 1000000;
4802     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4803     for (int i=0; i<20; i+=2) {
4804         longNumber.setCharAt(i, u' ');
4805     }
4806     bi->setText(longNumber);
4807     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4808     assertSuccess(WHERE, status);
4809 }
4810
4811 //
4812 //  TestDebug    -  A place-holder test for debugging purposes.
4813 //                  For putting in fragments of other tests that can be invoked
4814 //                  for tracing  without a lot of unwanted extra stuff happening.
4815 //
4816 void RBBITest::TestDebug(void) {
4817     UErrorCode status = U_ZERO_ERROR;
4818     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4819             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4820     if (!assertSuccess(WHERE, status, true)) {
4821         return;
4822     }
4823     const UnicodeString &rules = bi->getRules();
4824     UParseError pe;
4825     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4826     assertSuccess(WHERE, status);
4827 }
4828
4829 void RBBITest::TestProperties() {
4830     UErrorCode errorCode = U_ZERO_ERROR;
4831     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4832     if (!prependSet.isEmpty()) {
4833         errln(
4834             "[:GCB=Prepend:] is not empty any more. "
4835             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4836             "change this test to the opposite condition.");
4837     }
4838 }
4839
4840 #endif // #if !UCONFIG_NO_BREAK_ITERATION