icuSources/test/intltest/rbbitst.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1999-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /************************************************************************
   9 *   Date        Name        Description
  10 *   12/15/99    Madhu        Creation.
  11 *   01/12/2000  Madhu        Updated for changed API and added new tests
  12 ************************************************************************/
  13
  14 #include "unicode/utypes.h"
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20 #include <utility>
  21 #include <vector>
  22
  23 #include "unicode/brkiter.h"
  24 #include "unicode/localpointer.h"
  25 #include "unicode/numfmt.h"
  26 #include "unicode/rbbi.h"
  27 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  28 #include "unicode/regex.h"
  29 #endif
  30 #include "unicode/schriter.h"
  31 #include "unicode/uchar.h"
  32 #include "unicode/utf16.h"
  33 #include "unicode/ucnv.h"
  34 #include "unicode/uniset.h"
  35 #include "unicode/uscript.h"
  36 #include "unicode/ustring.h"
  37 #include "unicode/utext.h"
  38
  39 #include "charstr.h"
  40 #include "cmemory.h"
  41 #include "cstr.h"
  42 #include "intltest.h"
  43 #include "rbbitst.h"
  44 #include "rbbidata.h"
  45 #include "utypeinfo.h"  // for 'typeid' to work
  46 #include "uvector.h"
  47 #include "uvectr32.h"
  48
  49
  50 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  51 #include "unicode/filteredbrk.h"
  52 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  53
// TEST_ASSERT(x): report a test failure, tagged with the current source file and line,
// when the condition x is false.  The macro only reports; it does not return from the
// enclosing function.  It expands to a brace-enclosed block, not a single statement.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): report a test failure, tagged with the current source file,
// line and the error's name, when errcode is a failing UErrorCode.  Like TEST_ASSERT it only
// reports; callers that cannot continue must test the status and return themselves.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  59
  60 //---------------------------------------------
  61 // runIndexedTest
  62 //---------------------------------------------
  63
  64
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitst.txt.  In most cases it can;
//         that is much less work than writing a new test, the diagnostic output in the
//         event of failures is good, and the test data file is shared with ICU4J, so
//         eventually the test will run there as well, without additional effort.
  70
  71 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  72 {
  73     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  74     fTestParams = params;
  75
  76     TESTCASE_AUTO_BEGIN;
  77 #if !UCONFIG_NO_FILE_IO
  78     TESTCASE_AUTO(TestBug4153072);
  79 #endif
  80 #if !UCONFIG_NO_FILE_IO
  81     TESTCASE_AUTO(TestUnicodeFiles);
  82 #endif
  83     TESTCASE_AUTO(TestGetAvailableLocales);
  84     TESTCASE_AUTO(TestGetDisplayName);
  85 #if !UCONFIG_NO_FILE_IO
  86     TESTCASE_AUTO(TestEndBehaviour);
  87     TESTCASE_AUTO(TestWordBreaks);
  88     TESTCASE_AUTO(TestWordBoundary);
  89     TESTCASE_AUTO(TestLineBreaks);
  90     TESTCASE_AUTO(TestSentBreaks);
  91     TESTCASE_AUTO(TestExtended);
  92 #endif
  93 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
  94     TESTCASE_AUTO(TestMonkey);
  95 #endif
  96 #if !UCONFIG_NO_FILE_IO
  97     TESTCASE_AUTO(TestBug3818);
  98 #endif
  99     TESTCASE_AUTO(TestDebug);
 100 #if !UCONFIG_NO_FILE_IO
 101     TESTCASE_AUTO(TestBug5775);
 102 #endif
 103     TESTCASE_AUTO(TestBug9983);
 104     TESTCASE_AUTO(TestDictRules);
 105     TESTCASE_AUTO(TestBug5532);
 106     TESTCASE_AUTO(TestBug7547);
 107     TESTCASE_AUTO(TestBug12797);
 108     TESTCASE_AUTO(TestBug12918);
 109     TESTCASE_AUTO(TestBug12932);
 110     TESTCASE_AUTO(TestEmoji);
 111     TESTCASE_AUTO(TestBug12519);
 112     TESTCASE_AUTO(TestBug12677);
 113     TESTCASE_AUTO(TestTableRedundancies);
 114     TESTCASE_AUTO(TestBug13447);
 115     TESTCASE_AUTO(TestReverse);
 116     TESTCASE_AUTO(TestBug13692);
 117     TESTCASE_AUTO_END;
 118 }
 119
 120
 121 //--------------------------------------------------------------------------------------
 122 //
 123 //    RBBITest    constructor and destructor
 124 //
 125 //--------------------------------------------------------------------------------------
 126
 127 RBBITest::RBBITest() {
 128     fTestParams = NULL;
 129 }
 130
 131
 132 RBBITest::~RBBITest() {
 133 }
 134
 135
 136 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 137     UErrorCode status = U_ZERO_ERROR;
 138     char name[100];
 139     printf("code    alpha extend alphanum type word sent line name\n");
 140     int nextExpectedIndex = 0;
 141     utext_setNativeIndex(tstr, 0);
 142     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
 143         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 144             printf("------------------------------------------------ %d\n", j);
 145             ++nextExpectedIndex;
 146         }
 147
 148         UChar32 c = utext_next32(tstr);
 149         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 150         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 151                            u_isUAlphabetic(c),
 152                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 153                            u_isalnum(c),
 154                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 155                                                   u_charType(c),
 156                                                   U_SHORT_PROPERTY_NAME),
 157                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 158                                                   u_getIntPropertyValue(c,
 159                                                           UCHAR_WORD_BREAK),
 160                                                   U_SHORT_PROPERTY_NAME),
 161                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 162                                    u_getIntPropertyValue(c,
 163                                            UCHAR_SENTENCE_BREAK),
 164                                    U_SHORT_PROPERTY_NAME),
 165                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 166                                    u_getIntPropertyValue(c,
 167                                            UCHAR_LINE_BREAK),
 168                                    U_SHORT_PROPERTY_NAME),
 169                            name);
 170     }
 171 }
 172
 173
 174 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 175    UErrorCode status = U_ZERO_ERROR;
 176    UText *tstr = NULL;
 177    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 178    if (U_FAILURE(status)) {
 179        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 180        return;
 181     }
 182    printStringBreaks(tstr, expected, expectedCount);
 183    utext_close(tstr);
 184 }
 185
 186
 187 void RBBITest::TestBug3818() {
 188     UErrorCode  status = U_ZERO_ERROR;
 189
 190     // Four Thai words...
 191     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 192                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 193     UnicodeString  thaiStr(thaiWordData);
 194
 195     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 196     if (U_FAILURE(status) || bi == NULL) {
 197         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 198         return;
 199     }
 200     bi->setText(thaiStr);
 201
 202     int32_t  startOfSecondWord = bi->following(1);
 203     if (startOfSecondWord != 4) {
 204         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 205             __FILE__, __LINE__, startOfSecondWord);
 206     }
 207     startOfSecondWord = bi->following(0);
 208     if (startOfSecondWord != 4) {
 209         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 210             __FILE__, __LINE__, startOfSecondWord);
 211     }
 212     delete bi;
 213 }
 214
 215
 216 //---------------------------------------------
 217 //
 218 //     other tests
 219 //
 220 //---------------------------------------------
 221
 222 void RBBITest::TestGetAvailableLocales()
 223 {
 224     int32_t locCount = 0;
 225     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 226
 227     if (locCount == 0)
 228         dataerrln("getAvailableLocales() returned an empty list!");
 229     // Just make sure that it's returning good memory.
 230     int32_t i;
 231     for (i = 0; i < locCount; ++i) {
 232         logln(locList[i].getName());
 233     }
 234 }
 235
 236 //Testing the BreakIterator::getDisplayName() function
 237 void RBBITest::TestGetDisplayName()
 238 {
 239     UnicodeString   result;
 240
 241     BreakIterator::getDisplayName(Locale::getUS(), result);
 242     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 243         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 244                 + result);
 245
 246     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 247     if (result != "French (France)")
 248         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 249                 + result);
 250 }
 251 /**
 252  * Test End Behaviour
 253  * @bug 4068137
 254  */
 255 void RBBITest::TestEndBehaviour()
 256 {
 257     UErrorCode status = U_ZERO_ERROR;
 258     UnicodeString testString("boo.");
 259     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 260     if (U_FAILURE(status))
 261     {
 262         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 263         return;
 264     }
 265     wb->setText(testString);
 266
 267     if (wb->first() != 0)
 268         errln("Didn't get break at beginning of string.");
 269     if (wb->next() != 3)
 270         errln("Didn't get break before period in \"boo.\"");
 271     if (wb->current() != 4 && wb->next() != 4)
 272         errln("Didn't get break at end of string.");
 273     delete wb;
 274 }
 275 /*
 276  * @bug 4153072
 277  */
 278 void RBBITest::TestBug4153072() {
 279     UErrorCode status = U_ZERO_ERROR;
 280     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 281     if (U_FAILURE(status))
 282     {
 283         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 284         return;
 285     }
 286     UnicodeString str("...Hello, World!...");
 287     int32_t begin = 3;
 288     int32_t end = str.length() - 3;
 289     UBool onBoundary;
 290
 291     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 292     iter->adoptText(textIterator);
 293     int index;
 294     // Note: with the switch to UText, there is no way to restrict the
 295     //       iteration range to begin at an index other than zero.
 296     //       String character iterators created with a non-zero bound are
 297     //         treated by RBBI as being empty.
 298     for (index = -1; index < begin + 1; ++index) {
 299         onBoundary = iter->isBoundary(index);
 300         if (index == 0?  !onBoundary : onBoundary) {
 301             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 302                             " and begin index = " + begin);
 303         }
 304     }
 305     delete iter;
 306 }
 307
 308
 309 //
 310 // Test for problem reported by Ashok Matoria on 9 July 2007
 311 //    One.<kSoftHyphen><kSpace>Two.
 312 //
 313 //    Sentence break at start (0) and then on calling next() it breaks at
 314 //   'T' of "Two". Now, at this point if I do next() and
//    then previous(), it breaks at <kSoftHyphen> instead of 'T' of "Two".
 316 //
 317 void RBBITest::TestBug5775() {
 318     UErrorCode status = U_ZERO_ERROR;
 319     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 320     TEST_ASSERT_SUCCESS(status);
 321     if (U_FAILURE(status)) {
 322         return;
 323     }
 324 // Check for status first for better handling of no data errors.
 325     TEST_ASSERT(bi != NULL);
 326     if (bi == NULL) {
 327         return;
 328     }
 329
 330     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 331     //               01234      56789
 332     s = s.unescape();
 333     bi->setText(s);
 334     int pos = bi->next();
 335     TEST_ASSERT(pos == 6);
 336     pos = bi->next();
 337     TEST_ASSERT(pos == 10);
 338     pos = bi->previous();
 339     TEST_ASSERT(pos == 6);
 340     delete bi;
 341 }
 342
 343
 344
 345 //------------------------------------------------------------------------------
 346 //
 347 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 348 //
 349 //------------------------------------------------------------------------------
 350
 351 struct TestParams {
 352     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 353                                            //   Changed out whenever test data changes break type.
 354
 355     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 356     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 357     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 358     UVector32       *srcCol;
 359
 360     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 361     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 362     CharString       utf8String;           // UTF-8 form of text to break.
 363
 364     TestParams(UErrorCode &status) : dataToBreak() {
 365         bi               = NULL;
 366         expectedBreaks   = new UVector32(status);
 367         srcLine          = new UVector32(status);
 368         srcCol           = new UVector32(status);
 369         textToBreak      = NULL;
 370         textMap          = new UVector32(status);
 371     }
 372
 373     ~TestParams() {
 374         delete bi;
 375         delete expectedBreaks;
 376         delete srcLine;
 377         delete srcCol;
 378         utext_close(textToBreak);
 379         delete textMap;
 380     }
 381
 382     int32_t getSrcLine(int32_t bp);
 383     int32_t getExpectedBreak(int32_t bp);
 384     int32_t getSrcCol(int32_t bp);
 385
 386     void setUTF16(UErrorCode &status);
 387     void setUTF8(UErrorCode &status);
 388 };
 389
 390 // Append a UnicodeString to a CharString with UTF-8 encoding.
 391 // Substitute any invalid chars.
 392 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 393 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 394     if (U_FAILURE(status)) {
 395         return;
 396     }
 397     int32_t utf8Length;
 398     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 399                        src.getBuffer(), src.length(),   // UTF-16 data
 400                        0xfffd, NULL,                    // Substitution char, number of subs.
 401                        &status);
 402     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 403         return;
 404     }
 405     status = U_ZERO_ERROR;
 406     int32_t capacity;
 407     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 408     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 409                        src.getBuffer(), src.length(),
 410                        0xfffd, NULL, &status);
 411     dest.append(buffer, utf8Length, status);
 412 }
 413
 414
 415 void TestParams::setUTF16(UErrorCode &status) {
 416     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 417     textMap->removeAllElements();
 418     for (int32_t i=0; i<dataToBreak.length(); i++) {
 419         if (i == dataToBreak.getChar32Start(i)) {
 420             textMap->addElement(i, status);
 421         } else {
 422             textMap->addElement(-1, status);
 423         }
 424     }
 425     textMap->addElement(dataToBreak.length(), status);
 426     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 427 }
 428
 429
 430 void TestParams::setUTF8(UErrorCode &status) {
 431     if (U_FAILURE(status)) {
 432         return;
 433     }
 434     utf8String.clear();
 435     CharStringAppend(utf8String, dataToBreak, status);
 436     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 437     if (U_FAILURE(status)) {
 438         return;
 439     }
 440
 441     textMap->removeAllElements();
 442     int32_t utf16Index = 0;
 443     for (;;) {
 444         textMap->addElement(utf16Index, status);
 445         UChar32 c32 = utext_current32(textToBreak);
 446         if (c32 < 0) {
 447             break;
 448         }
 449         utf16Index += U16_LENGTH(c32);
 450         utext_next32(textToBreak);
 451         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 452             textMap->addElement(-1, status);
 453         }
 454     }
 455     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 456 }
 457
 458
 459 int32_t TestParams::getSrcLine(int32_t bp) {
 460     if (bp >= textMap->size()) {
 461         bp = textMap->size() - 1;
 462     }
 463     int32_t i = 0;
 464     for(; bp >= 0 ; --bp) {
 465         // Move to a character boundary if we are not on one already.
 466         i = textMap->elementAti(bp);
 467         if (i >= 0) {
 468             break;
 469         }
 470     }
 471     return srcLine->elementAti(i);
 472 }
 473
 474
 475 int32_t TestParams::getExpectedBreak(int32_t bp) {
 476     if (bp >= textMap->size()) {
 477         return 0;
 478     }
 479     int32_t i = textMap->elementAti(bp);
 480     int32_t retVal = 0;
 481     if (i >= 0) {
 482         retVal = expectedBreaks->elementAti(i);
 483     }
 484     return retVal;
 485 }
 486
 487
 488 int32_t TestParams::getSrcCol(int32_t bp) {
 489     if (bp >= textMap->size()) {
 490         bp = textMap->size() - 1;
 491     }
 492     int32_t i = 0;
 493     for(; bp >= 0; --bp) {
 494         // Move bp to a character boundary if we are not on one already.
 495         i = textMap->elementAti(bp);
 496         if (i >= 0) {
 497             break;
 498         }
 499     }
 500     return srcCol->elementAti(i);
 501 }
 502
 503
 504 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 505     int32_t    bp;
 506     int32_t    prevBP;
 507     int32_t    i;
 508
 509     TEST_ASSERT_SUCCESS(status);
 510     if (U_FAILURE(status)) {
 511         return;
 512     }
 513
 514     if (t->bi == NULL) {
 515         return;
 516     }
 517
 518     t->bi->setText(t->textToBreak, status);
 519     //
 520     //  Run the iterator forward
 521     //
 522     prevBP = -1;
 523     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 524         if (prevBP ==  bp) {
 525             // Fail for lack of forward progress.
 526             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 527                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 528             break;
 529         }
 530
 531         // Check that there we didn't miss an expected break between the last one
 532         //  and this one.
 533         for (i=prevBP+1; i<bp; i++) {
 534             if (t->getExpectedBreak(i) != 0) {
 535                 int expected[] = {0, i};
 536                 printStringBreaks(t->dataToBreak, expected, 2);
 537                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 538                       i, t->getSrcLine(i), t->getSrcCol(i));
 539             }
 540         }
 541
 542         // Check that the break we did find was expected
 543         if (t->getExpectedBreak(bp) == 0) {
 544             int expected[] = {0, bp};
 545             printStringBreaks(t->textToBreak, expected, 2);
 546             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 547                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 548         } else {
 549             // The break was expected.
 550             //   Check that the {nnn} tag value is correct.
 551             int32_t expectedTagVal = t->getExpectedBreak(bp);
 552             if (expectedTagVal == -1) {
 553                 expectedTagVal = 0;
 554             }
 555             int32_t line = t->getSrcLine(bp);
 556             int32_t rs = t->bi->getRuleStatus();
 557             if (rs != expectedTagVal) {
 558                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
 559                       "          Actual, Expected status = %4d, %4d",
 560                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
 561             }
 562         }
 563
 564         prevBP = bp;
 565     }
 566
 567     // Verify that there were no missed expected breaks after the last one found
 568     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
 569         if (t->getExpectedBreak(i) != 0) {
 570             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 571                       i, t->getSrcLine(i), t->getSrcCol(i));
 572         }
 573     }
 574
 575     //
 576     //  Run the iterator backwards, verify that the same breaks are found.
 577     //
 578     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
 579     bp = t->bi->last();
 580     while (bp != BreakIterator::DONE) {
 581         if (prevBP ==  bp) {
 582             // Fail for lack of progress.
 583             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
 584                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 585             break;
 586         }
 587
 588         // Check that we didn't miss an expected break between the last one
 589         //  and this one.  (UVector returns zeros for index out of bounds.)
 590         for (i=prevBP-1; i>bp; i--) {
 591             if (t->getExpectedBreak(i) != 0) {
 592                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 593                       i, t->getSrcLine(i), t->getSrcCol(i));
 594             }
 595         }
 596
 597         // Check that the break we did find was expected
 598         if (t->getExpectedBreak(bp) == 0) {
 599             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 600                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
 601         } else {
 602             // The break was expected.
 603             //   Check that the {nnn} tag value is correct.
 604             int32_t expectedTagVal = t->getExpectedBreak(bp);
 605             if (expectedTagVal == -1) {
 606                 expectedTagVal = 0;
 607             }
 608             int line = t->getSrcLine(bp);
 609             int32_t rs = t->bi->getRuleStatus();
 610             if (rs != expectedTagVal) {
 611                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
 612                       "          Actual, Expected status = %4d, %4d",
 613                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
 614             }
 615         }
 616
 617         prevBP = bp;
 618         bp = t->bi->previous();
 619     }
 620
 621     // Verify that there were no missed breaks prior to the last one found
 622     for (i=prevBP-1; i>=0; i--) {
 623         if (t->getExpectedBreak(i) != 0) {
 624             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 625                       i, t->getSrcLine(i), t->getSrcCol(i));
 626         }
 627     }
 628
 629     // Check isBoundary()
 630     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
 631         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
 632         UBool boundaryFound    = t->bi->isBoundary(i);
 633         if (boundaryExpected != boundaryFound) {
 634             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
 635                   "        Expected, Actual= %s, %s",
 636                   i, t->getSrcLine(i), t->getSrcCol(i),
 637                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
 638         }
 639     }
 640
 641     // Check following()
 642     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
 643         int32_t actualBreak = t->bi->following(i);
 644         int32_t expectedBreak = BreakIterator::DONE;
 645         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
 646             if (t->getExpectedBreak(j) != 0) {
 647                 expectedBreak = j;
 648                 break;
 649             }
 650         }
 651         if (expectedBreak != actualBreak) {
 652             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
 653                   "        Expected, Actual= %d, %d",
 654                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
 655         }
 656     }
 657
 658     // Check preceding()
 659     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
 660         int32_t actualBreak = t->bi->preceding(i);
 661         int32_t expectedBreak = BreakIterator::DONE;
 662
 663         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
 664         // preceding(trailing byte) will return the index of some preceding code point,
 665         // not the lead byte of the current code point, even though that has a smaller index.
 666         // Therefore, start looking at the expected break data not at i-1, but at
 667         // the start of code point index - 1.
 668         utext_setNativeIndex(t->textToBreak, i);
 669         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
 670         for (; j >= 0; j--) {
 671             if (t->getExpectedBreak(j) != 0) {
 672                 expectedBreak = j;
 673                 break;
 674             }
 675         }
 676         if (expectedBreak != actualBreak) {
 677             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
 678                   "        Expected, Actual= %d, %d",
 679                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
 680         }
 681     }
 682 }
 683
 684
 685 void RBBITest::TestExtended() {
 686   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
 687   // data driven test closely entangles filtered and regular data.
 688 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
 689     UErrorCode      status  = U_ZERO_ERROR;
 690     Locale          locale("");
 691
 692     TestParams          tp(status);
 693
 694     RegexMatcher      localeMatcher(UnicodeString(u"<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
 695     if (U_FAILURE(status)) {
 696         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
 697     }
 698
 699     //
 700     //  Open and read the test data file.
 701     //
 702     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 703     CharString testFileName(testDataDirectory, -1, status);
 704     testFileName.append("rbbitst.txt", -1, status);
 705
 706     int    len;
 707     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
 708     if (U_FAILURE(status)) {
 709         errln("%s:%d Error %s opening file rbbitst.txt", __FILE__, __LINE__, u_errorName(status));
 710         return;
 711     }
 712
 713     bool skipTest = false; // Skip this test?
 714
 715     //
 716     //  Put the test data into a UnicodeString
 717     //
 718     UnicodeString testString(FALSE, testFile, len);
 719
 720     enum EParseState{
 721         PARSE_COMMENT,
 722         PARSE_TAG,
 723         PARSE_DATA,
 724         PARSE_NUM,
 725         PARSE_RULES
 726     }
 727     parseState = PARSE_TAG;
 728
 729     EParseState savedState = PARSE_TAG;
 730
 731     int32_t    lineNum  = 1;
 732     int32_t    colStart = 0;
 733     int32_t    column   = 0;
 734     int32_t    charIdx  = 0;
 735
 736     int32_t    tagValue = 0;             // The numeric value of a <nnn> tag.
 737
 738     UnicodeString       rules;           // Holds rules from a <rules> ... </rules> block
 739     int32_t             rulesFirstLine;  // Line number of the start of current <rules> block
 740
 741     for (charIdx = 0; charIdx < len; ) {
 742         status = U_ZERO_ERROR;
 743         UChar  c = testString.charAt(charIdx);
 744         charIdx++;
 745         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
 746             // treat CRLF as a unit
 747             c = u'\n';
 748             charIdx++;
 749         }
 750         if (c == u'\n' || c == u'\r') {
 751             lineNum++;
 752             colStart = charIdx;
 753         }
 754         column = charIdx - colStart + 1;
 755
 756         switch (parseState) {
 757         case PARSE_COMMENT:
 758             if (c == u'\n' || c == u'\r') {
 759                 parseState = savedState;
 760             }
 761             break;
 762
 763         case PARSE_TAG:
 764             {
 765             if (c == u'#') {
 766                 parseState = PARSE_COMMENT;
 767                 savedState = PARSE_TAG;
 768                 break;
 769             }
 770             if (u_isUWhiteSpace(c)) {
 771                 break;
 772             }
 773             if (testString.compare(charIdx-1, 6, u"<word>") == 0) {
 774                 delete tp.bi;
 775                 tp.bi = BreakIterator::createWordInstance(locale,  status);
 776                 skipTest = false;
 777                 charIdx += 5;
 778                 break;
 779             }
 780             if (testString.compare(charIdx-1, 6, u"<char>") == 0) {
 781                 delete tp.bi;
 782                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
 783                 skipTest = false;
 784                 charIdx += 5;
 785                 break;
 786             }
 787             if (testString.compare(charIdx-1, 6, u"<line>") == 0) {
 788                 delete tp.bi;
 789                 tp.bi = BreakIterator::createLineInstance(locale,  status);
 790                 skipTest = false;
 791                 charIdx += 5;
 792                 break;
 793             }
 794             if (testString.compare(charIdx-1, 6, u"<sent>") == 0) {
 795                 delete tp.bi;
 796                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
 797                 skipTest = false;
 798                 charIdx += 5;
 799                 break;
 800             }
 801             if (testString.compare(charIdx-1, 7, u"<title>") == 0) {
 802                 delete tp.bi;
 803                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
 804                 charIdx += 6;
 805                 break;
 806             }
 807
 808             if (testString.compare(charIdx-1, 7, u"<rules>") == 0 ||
 809                 testString.compare(charIdx-1, 10, u"<badrules>") == 0) {
 810                 charIdx = testString.indexOf(u'>', charIdx) + 1;
 811                 parseState = PARSE_RULES;
 812                 rules.remove();
 813                 rulesFirstLine = lineNum;
 814                 break;
 815             }
 816
 817             // <locale  loc_name>
 818             localeMatcher.reset(testString);
 819             if (localeMatcher.lookingAt(charIdx-1, status)) {
 820                 UnicodeString localeName = localeMatcher.group(1, status);
 821                 char localeName8[100];
 822                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
 823                 locale = Locale::createFromName(localeName8);
 824                 charIdx += localeMatcher.group(0, status).length() - 1;
 825                 TEST_ASSERT_SUCCESS(status);
 826                 break;
 827             }
 828             if (testString.compare(charIdx-1, 6, u"<data>") == 0) {
 829                 parseState = PARSE_DATA;
 830                 charIdx += 5;
 831                 tp.dataToBreak = "";
 832                 tp.expectedBreaks->removeAllElements();
 833                 tp.srcCol ->removeAllElements();
 834                 tp.srcLine->removeAllElements();
 835                 break;
 836             }
 837
 838             errln("line %d: Tag expected in test file.", lineNum);
 839             parseState = PARSE_COMMENT;
 840             savedState = PARSE_DATA;
 841             goto end_test; // Stop the test.
 842             }
 843             break;
 844
 845         case PARSE_RULES:
 846             if (testString.compare(charIdx-1, 8, u"</rules>") == 0) {
 847                 charIdx += 7;
 848                 parseState = PARSE_TAG;
 849                 delete tp.bi;
 850                 UParseError pe;
 851                 tp.bi = new RuleBasedBreakIterator(rules, pe, status);
 852                 skipTest = U_FAILURE(status);
 853                 if (U_FAILURE(status)) {
 854                     errln("file rbbitst.txt: %d - Error %s creating break iterator from rules.",
 855                         rulesFirstLine + pe.line - 1, u_errorName(status));
 856                 }
 857             } else if (testString.compare(charIdx-1, 11, u"</badrules>") == 0) {
 858                 charIdx += 10;
 859                 parseState = PARSE_TAG;
 860                 UErrorCode ec = U_ZERO_ERROR;
 861                 UParseError pe;
 862                 RuleBasedBreakIterator bi(rules, pe, ec);
 863                 if (U_SUCCESS(ec)) {
 864                     errln("file rbbitst.txt: %d - Expected, but did not get, a failure creating break iterator from rules.",
 865                         rulesFirstLine + pe.line - 1);
 866                 }
 867             } else {
 868                 rules.append(c);
 869             }
 870             break;
 871
 872         case PARSE_DATA:
 873             if (c == u'\u2022') { // u'•'
 874                 int32_t  breakIdx = tp.dataToBreak.length();
 875                 tp.expectedBreaks->setSize(breakIdx+1);
 876                 tp.expectedBreaks->setElementAt(-1, breakIdx);
 877                 tp.srcLine->setSize(breakIdx+1);
 878                 tp.srcLine->setElementAt(lineNum, breakIdx);
 879                 tp.srcCol ->setSize(breakIdx+1);
 880                 tp.srcCol ->setElementAt(column, breakIdx);
 881                 break;
 882             }
 883
 884             if (testString.compare(charIdx-1, 7, u"</data>") == 0) {
 885                 // Add final entry to mappings from break location to source file position.
 886                 //  Need one extra because last break position returned is after the
 887                 //    last char in the data, not at the last char.
 888                 tp.srcLine->addElement(lineNum, status);
 889                 tp.srcCol ->addElement(column, status);
 890
 891                 parseState = PARSE_TAG;
 892                 charIdx += 6;
 893
 894                 if (!skipTest) {
 895                     // RUN THE TEST!
 896                     status = U_ZERO_ERROR;
 897                     tp.setUTF16(status);
 898                     executeTest(&tp, status);
 899                     TEST_ASSERT_SUCCESS(status);
 900
 901                     // Run again, this time with UTF-8 text wrapped in a UText.
 902                     status = U_ZERO_ERROR;
 903                     tp.setUTF8(status);
 904                     TEST_ASSERT_SUCCESS(status);
 905                     executeTest(&tp, status);
 906                 }
 907                 break;
 908             }
 909
 910             if (testString.compare(charIdx-1, 3, u"\\N{") == 0) {
 911                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
 912                 // Get the code point from the name and insert it into the test data.
 913                 //   (Damn, no API takes names in Unicode  !!!
 914                 //    we've got to take it back to char *)
 915                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
 916                 int32_t nameLength = nameEndIdx - (charIdx+2);
 917                 char charNameBuf[200];
 918                 UChar32 theChar = -1;
 919                 if (nameEndIdx != -1) {
 920                     UErrorCode status = U_ZERO_ERROR;
 921                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
 922                     charNameBuf[sizeof(charNameBuf)-1] = 0;
 923                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
 924                     if (U_FAILURE(status)) {
 925                         theChar = -1;
 926                     }
 927                 }
 928                 if (theChar == -1) {
 929                     errln("Error in named character in test file at line %d, col %d",
 930                         lineNum, column);
 931                 } else {
 932                     // Named code point was recognized.  Insert it
 933                     //   into the test data.
 934                     tp.dataToBreak.append(theChar);
 935                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
 936                         tp.srcLine->addElement(lineNum, status);
 937                         tp.srcCol ->addElement(column, status);
 938                     }
 939                 }
 940                 if (nameEndIdx > charIdx) {
 941                     charIdx = nameEndIdx+1;
 942
 943                 }
 944                 break;
 945             }
 946
 947
 948
 949             if (testString.compare(charIdx-1, 2, u"<>") == 0) {
 950                 charIdx++;
 951                 int32_t  breakIdx = tp.dataToBreak.length();
 952                 tp.expectedBreaks->setSize(breakIdx+1);
 953                 tp.expectedBreaks->setElementAt(-1, breakIdx);
 954                 tp.srcLine->setSize(breakIdx+1);
 955                 tp.srcLine->setElementAt(lineNum, breakIdx);
 956                 tp.srcCol ->setSize(breakIdx+1);
 957                 tp.srcCol ->setElementAt(column, breakIdx);
 958                 break;
 959             }
 960
 961             if (c == u'<') {
 962                 tagValue   = 0;
 963                 parseState = PARSE_NUM;
 964                 break;
 965             }
 966
 967             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
 968                 parseState = PARSE_COMMENT;
 969                 savedState = PARSE_DATA;
 970                 break;
 971             }
 972
 973             if (c == u'\\') {
 974                 // Check for \ at end of line, a line continuation.
 975                 //     Advance over (discard) the newline
 976                 UChar32 cp = testString.char32At(charIdx);
 977                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
 978                     // We have a CR LF
 979                     //  Need an extra increment of the input ptr to move over both of them
 980                     charIdx++;
 981                 }
 982                 if (cp == u'\n' || cp == u'\r') {
 983                     lineNum++;
 984                     colStart = charIdx;
 985                     charIdx++;
 986                     break;
 987                 }
 988
 989                 // Let unescape handle the back slash.
 990                 cp = testString.unescapeAt(charIdx);
 991                 if (cp != -1) {
 992                     // Escape sequence was recognized.  Insert the char
 993                     //   into the test data.
 994                     tp.dataToBreak.append(cp);
 995                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
 996                         tp.srcLine->addElement(lineNum, status);
 997                         tp.srcCol ->addElement(column, status);
 998                     }
 999                     break;
1000                 }
1001
1002
1003                 // Not a recognized backslash escape sequence.
1004                 // Take the next char as a literal.
1005                 //  TODO:  Should this be an error?
1006                 c = testString.charAt(charIdx);
1007                 charIdx = testString.moveIndex32(charIdx, 1);
1008             }
1009
1010             // Normal, non-escaped data char.
1011             tp.dataToBreak.append(c);
1012
1013             // Save the mapping from offset in the data to line/column numbers in
1014             //   the original input file.  Will be used for better error messages only.
1015             //   If there's an expected break before this char, the slot in the mapping
1016             //     vector will already be set for this char; don't overwrite it.
1017             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1018                 tp.srcLine->addElement(lineNum, status);
1019                 tp.srcCol ->addElement(column, status);
1020             }
1021             break;
1022
1023
1024         case PARSE_NUM:
1025             // We are parsing an expected numeric tag value, like <1234>,
1026             //   within a chunk of data.
1027             if (u_isUWhiteSpace(c)) {
1028                 break;
1029             }
1030
1031             if (c == u'>') {
1032                 // Finished the number.  Add the info to the expected break data,
1033                 //   and switch parse state back to doing plain data.
1034                 parseState = PARSE_DATA;
1035                 if (tagValue == 0) {
1036                     tagValue = -1;
1037                 }
1038                 int32_t  breakIdx = tp.dataToBreak.length();
1039                 tp.expectedBreaks->setSize(breakIdx+1);
1040                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1041                 tp.srcLine->setSize(breakIdx+1);
1042                 tp.srcLine->setElementAt(lineNum, breakIdx);
1043                 tp.srcCol ->setSize(breakIdx+1);
1044                 tp.srcCol ->setElementAt(column, breakIdx);
1045                 break;
1046             }
1047
1048             if (u_isdigit(c)) {
1049                 tagValue = tagValue*10 + u_charDigitValue(c);
1050                 break;
1051             }
1052
1053             errln("Syntax Error in test file at line %d, col %d",
1054                 lineNum, column);
1055             parseState = PARSE_COMMENT;
1056             goto end_test; // Stop the test
1057             break;
1058         }
1059
1060
1061         if (U_FAILURE(status)) {
1062             dataerrln("ICU Error %s while parsing test file at line %d.",
1063                 u_errorName(status), lineNum);
1064             status = U_ZERO_ERROR;
1065             goto end_test; // Stop the test
1066         }
1067
1068     }
1069
1070     // Reached end of test file. Raise an error if parseState indicates that we are
1071     //   within a block that should have been terminated.
1072
1073     if (parseState == PARSE_RULES) {
1074         errln("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
1075             lineNum, rulesFirstLine);
1076     }
1077     if (parseState == PARSE_DATA) {
1078         errln("rbbitst.txt:%d <data> block not closed.", lineNum);
1079     }
1080
1081
1082 end_test:
1083     delete [] testFile;
1084 #endif
1085 }
1086
1087
//-------------------------------------------------------------------------------
//
//  TestDictRules   Create a break iterator from source rules that include a
//                  dictionary range.  Regression for bug #7130.  The source rules
//                  do not declare a break iterator type (word, line, sentence, etc.),
//                  and the dictionary code, without a type, would loop.
//
//-------------------------------------------------------------------------------
1096 void RBBITest::TestDictRules() {
1097     const char *rules =  "$dictionary = [a-z]; \n"
1098                          "!!forward; \n"
1099                          "$dictionary $dictionary; \n"
1100                          "!!reverse; \n"
1101                          "$dictionary $dictionary; \n";
1102     const char *text = "aa";
1103     UErrorCode status = U_ZERO_ERROR;
1104     UParseError parseError;
1105
1106     RuleBasedBreakIterator bi(rules, parseError, status);
1107     if (U_SUCCESS(status)) {
1108         UnicodeString utext = text;
1109         bi.setText(utext);
1110         int32_t position;
1111         int32_t loops;
1112         for (loops = 0; loops<10; loops++) {
1113             position = bi.next();
1114             if (position == RuleBasedBreakIterator::DONE) {
1115                 break;
1116             }
1117         }
1118         TEST_ASSERT(loops == 1);
1119     } else {
1120         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1121     }
1122 }
1123
1124
1125
1126 //-------------------------------------------------------------------------------
1127 //
1128 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1129 //    return the data in one big UChar * buffer, which the caller must delete.
1130 //
1131 //    parameters:
1132 //          fileName:   the name of the file, with no directory part.  The test data directory
1133 //                      is assumed.
1134 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1135 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1136 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1137 //                      Pass NULL for the system default encoding.
1138 //          status
1139 //    returns:
1140 //                      The file data, converted to UChar.
1141 //                      The caller must delete this when done with
1142 //                           delete [] theBuffer;
1143 //
1144 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1145 //           Move this function to some common place.
1146 //
1147 //--------------------------------------------------------------------------------
1148 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1149     UChar       *retPtr  = NULL;
1150     char        *fileBuf = NULL;
1151     UConverter* conv     = NULL;
1152     FILE        *f       = NULL;
1153
1154     ulen = 0;
1155     if (U_FAILURE(status)) {
1156         return retPtr;
1157     }
1158
1159     //
1160     //  Open the file.
1161     //
1162     f = fopen(fileName, "rb");
1163     if (f == 0) {
1164         dataerrln("Error opening test data file %s\n", fileName);
1165         status = U_FILE_ACCESS_ERROR;
1166         return NULL;
1167     }
1168     //
1169     //  Read it in
1170     //
1171     int   fileSize;
1172     int   amt_read;
1173
1174     fseek( f, 0, SEEK_END);
1175     fileSize = ftell(f);
1176     fileBuf = new char[fileSize];
1177     fseek(f, 0, SEEK_SET);
1178     amt_read = fread(fileBuf, 1, fileSize, f);
1179     if (amt_read != fileSize || fileSize <= 0) {
1180         errln("Error reading test data file.");
1181         goto cleanUpAndReturn;
1182     }
1183
1184     //
1185     // Look for a Unicode Signature (BOM) on the data just read
1186     //
1187     int32_t        signatureLength;
1188     const char *   fileBufC;
1189     const char*    bomEncoding;
1190
1191     fileBufC = fileBuf;
1192     bomEncoding = ucnv_detectUnicodeSignature(
1193         fileBuf, fileSize, &signatureLength, &status);
1194     if(bomEncoding!=NULL ){
1195         fileBufC  += signatureLength;
1196         fileSize  -= signatureLength;
1197         encoding = bomEncoding;
1198     }
1199
1200     //
1201     // Open a converter to take the rule file to UTF-16
1202     //
1203     conv = ucnv_open(encoding, &status);
1204     if (U_FAILURE(status)) {
1205         goto cleanUpAndReturn;
1206     }
1207
1208     //
1209     // Convert the rules to UChar.
1210     //  Preflight first to determine required buffer size.
1211     //
1212     ulen = ucnv_toUChars(conv,
1213         NULL,           //  dest,
1214         0,              //  destCapacity,
1215         fileBufC,
1216         fileSize,
1217         &status);
1218     if (status == U_BUFFER_OVERFLOW_ERROR) {
1219         // Buffer Overflow is expected from the preflight operation.
1220         status = U_ZERO_ERROR;
1221
1222         retPtr = new UChar[ulen+1];
1223         ucnv_toUChars(conv,
1224             retPtr,       //  dest,
1225             ulen+1,
1226             fileBufC,
1227             fileSize,
1228             &status);
1229     }
1230
1231 cleanUpAndReturn:
1232     fclose(f);
1233     delete []fileBuf;
1234     ucnv_close(conv);
1235     if (U_FAILURE(status)) {
1236         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1237         delete []retPtr;
1238         retPtr = 0;
1239         ulen   = 0;
1240     };
1241     return retPtr;
1242 }
1243
1244
1245
1246 //--------------------------------------------------------------------------------------------
1247 //
1248 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1249 //
1250 //-------------------------------------------------------------------------------------------
1251 void RBBITest::TestUnicodeFiles() {
1252     RuleBasedBreakIterator  *bi;
1253     UErrorCode               status = U_ZERO_ERROR;
1254
1255     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1256     TEST_ASSERT_SUCCESS(status);
1257     if (U_SUCCESS(status)) {
1258         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1259     }
1260     delete bi;
1261
1262     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1263     TEST_ASSERT_SUCCESS(status);
1264     if (U_SUCCESS(status)) {
1265         runUnicodeTestData("WordBreakTest.txt", bi);
1266     }
1267     delete bi;
1268
1269     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1270     TEST_ASSERT_SUCCESS(status);
1271     if (U_SUCCESS(status)) {
1272         runUnicodeTestData("SentenceBreakTest.txt", bi);
1273     }
1274     delete bi;
1275
1276     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1277     TEST_ASSERT_SUCCESS(status);
1278     if (U_SUCCESS(status)) {
1279         runUnicodeTestData("LineBreakTest.txt", bi);
1280     }
1281     delete bi;
1282 }
1283
1284
1285 // Check for test cases from the Unicode test data files that are known to fail
1286 // and should be skipped because ICU is not yet able to fully implement the spec.
1287 // See ticket #7270.
1288
1289 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1290     static struct TestCase {
1291         const char *fFileName;
1292         const UChar *fString;
1293     } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
1294         {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
1295         {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
1296         {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
1297         {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
1298         {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
1299         {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
1300                                                         // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
1301         {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
1302         {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
1303         {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
1304
1305                                                         // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
1306         {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
1307         {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
1308     };
1309
1310     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1311         const TestCase &badCase = badTestCases[n];
1312         if (!strcmp(fileName, badCase.fFileName) &&
1313                 testCase == UnicodeString(badCase.fString)) {
1314             return logKnownIssue("7270");
1315         }
1316     }
1317     return FALSE;
1318 }
1319
1320
1321 //--------------------------------------------------------------------------------------------
1322 //
1323 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1324 //
1325 //-------------------------------------------------------------------------------------------
1326 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1327 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1328     UErrorCode  status = U_ZERO_ERROR;
1329
1330     //
1331     //  Open and read the test data file, put it into a UnicodeString.
1332     //
1333     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1334     char testFileName[1000];
1335     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1336         dataerrln("Can't open test data.  Path too long.");
1337         return;
1338     }
1339     strcpy(testFileName, testDataDirectory);
1340     strcat(testFileName, fileName);
1341
1342     logln("Opening data file %s\n", fileName);
1343
1344     int    len;
1345     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1346     if (status != U_FILE_ACCESS_ERROR) {
1347         TEST_ASSERT_SUCCESS(status);
1348         TEST_ASSERT(testFile != NULL);
1349     }
1350     if (U_FAILURE(status) || testFile == NULL) {
1351         return; /* something went wrong, error already output */
1352     }
1353     UnicodeString testFileAsString(TRUE, testFile, len);
1354
1355     //
1356     //  Parse the test data file using a regular expression.
1357     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1358     //     is identified by which group had a match.
1359     //
1360     //    Caputure Group #                  1          2            3            4           5
1361     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1362     //
1363     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1364     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1365     UnicodeString   testString;
1366     UVector32       breakPositions(status);
1367     int             lineNumber = 1;
1368     TEST_ASSERT_SUCCESS(status);
1369     if (U_FAILURE(status)) {
1370         return;
1371     }
1372
1373     //
1374     //  Scan through each test case, building up the string to be broken in testString,
1375     //   and the positions that should be boundaries in the breakPositions vector.
1376     //
1377     int spin = 0;
1378     while (tokenMatcher.find()) {
1379         if(tokenMatcher.hitEnd()) {
1380           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1381              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1382              and caused an infinite loop here on EBCDIC systems!
1383           */
1384           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1385           //       return;
1386         }
1387         if (tokenMatcher.start(1, status) >= 0) {
1388             // Scanned a divide sign, indicating a break position in the test data.
1389             if (testString.length()>0) {
1390                 breakPositions.addElement(testString.length(), status);
1391             }
1392         }
1393         else if (tokenMatcher.start(2, status) >= 0) {
1394             // Scanned an 'x', meaning no break at this position in the test data
1395             //   Nothing to be done here.
1396             }
1397         else if (tokenMatcher.start(3, status) >= 0) {
1398             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1399             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1400             int length = hexNumber.length();
1401             if (length<=8) {
1402                 char buf[10];
1403                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1404                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1405                 if (c<=0x10ffff) {
1406                     testString.append(c);
1407                 } else {
1408                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1409                        fileName, lineNumber);
1410                 }
1411             } else {
1412                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1413                        fileName, lineNumber);
1414              }
1415         }
1416         else if (tokenMatcher.start(4, status) >= 0) {
1417             // Scanned to end of a line, possibly skipping over a comment in the process.
1418             //   If the line from the file contained test data, run the test now.
1419             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1420                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1421             }
1422
1423             // Clear out this test case.
1424             //    The string and breakPositions vector will be refilled as the next
1425             //       test case is parsed.
1426             testString.remove();
1427             breakPositions.removeAllElements();
1428             lineNumber++;
1429         } else {
1430             // Scanner catchall.  Something unrecognized appeared on the line.
1431             char token[16];
1432             UnicodeString uToken = tokenMatcher.group(0, status);
1433             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1434             token[sizeof(token)-1] = 0;
1435             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1436
1437             // Clean up, in preparation for continuing with the next line.
1438             testString.remove();
1439             breakPositions.removeAllElements();
1440             lineNumber++;
1441         }
1442         TEST_ASSERT_SUCCESS(status);
1443         if (U_FAILURE(status)) {
1444             break;
1445         }
1446     }
1447
1448     delete [] testFile;
1449  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1450 }
1451
1452 //--------------------------------------------------------------------------------------------
1453 //
1454 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1455 //                            test data files.  Do only a simple, forward-only check -
1456 //                            this test is mostly to check that ICU and the Unicode
1457 //                            data agree with each other.
1458 //
1459 //--------------------------------------------------------------------------------------------
1460 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1461                          const UnicodeString &testString,   // Text data to be broken
1462                          UVector32 *breakPositions,         // Positions where breaks should be found.
1463                          RuleBasedBreakIterator *bi) {
1464     int32_t pos;                 // Break Position in the test string
1465     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1466     int32_t expectedPos;         // Expected break position (index into test string)
1467
1468     bi->setText(testString);
1469     pos = bi->first();
1470     pos = bi->next();
1471
1472     while (pos != BreakIterator::DONE) {
1473         if (expectedI >= breakPositions->size()) {
1474             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1475                 testFileName, lineNumber, pos);
1476             break;
1477         }
1478         expectedPos = breakPositions->elementAti(expectedI);
1479         if (pos < expectedPos) {
1480             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1481                 testFileName, lineNumber, pos);
1482             break;
1483         }
1484         if (pos > expectedPos) {
1485             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1486                 testFileName, lineNumber, expectedPos);
1487             break;
1488         }
1489         pos = bi->next();
1490         expectedI++;
1491     }
1492
1493     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1494         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1495             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1496     }
1497 }
1498
1499
1500
1501 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
//---------------------------------------------------------------------------------------
//
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
//
//---------------------------------------------------------------------------------------
// Abstract interface through which the monkey test drives one kind of break
// iteration.  All three operations are pure virtual; instances can only be
// created through derived classes because the constructor is protected.
class RBBIMonkeyKind {
public:
    // Return a UVector of UnicodeSets, representing the character classes used
    //   for this type of iterator.
    virtual  UVector  *charClasses() = 0;

    // Set the test text on which subsequent calls to next() will operate
    virtual  void      setText(const UnicodeString &s) = 0;

    // Find the next break position, starting from the previous break position, or from zero.
    // Return -1 after reaching end of string.
    virtual  int32_t   next(int32_t i) = 0;

    // Virtual so that derived objects can be deleted through a base pointer.
    virtual ~RBBIMonkeyKind();

    // Publicly readable ICU error code; the constructor sets it to U_ZERO_ERROR.
    // NOTE(review): presumably derived classes record setup failures here for
    //   the caller to inspect - confirm against the derived constructors.
    UErrorCode       deferredStatus;


protected:
    // Only derived classes may construct.
    RBBIMonkeyKind();

private:
};
1536
1537 RBBIMonkeyKind::RBBIMonkeyKind() {
1538     deferredStatus = U_ZERO_ERROR;
1539 }
1540
1541 RBBIMonkeyKind::~RBBIMonkeyKind() {
1542 }
1543
1544
1545 //----------------------------------------------------------------------------------------
1546 //
1547 //   Random Numbers.  Similar to standard lib rand() and srand()
1548 //                    Not using library to
1549 //                      1.  Get same results on all platforms.
1550 //                      2.  Get access to current seed, to more easily reproduce failures.
1551 //
1552 //---------------------------------------------------------------------------------------
// Current state of the generator.  Assign to it to (re)seed.
static uint32_t m_seed = 1;

// Advance the linear congruential generator one step and return a value
// in the range 0..32767, taken from bits 16..30 of the new state.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    return (m_seed >> 16) & 0x7FFF;
}
1560
1561
1562 //------------------------------------------------------------------------------------------
1563 //
1564 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1565 //                             of RBBIMonkeyKind.
1566 //
1567 //------------------------------------------------------------------------------------------
1568 class RBBICharMonkey: public RBBIMonkeyKind {
1569 public:
1570     RBBICharMonkey();
1571     virtual          ~RBBICharMonkey();
1572     virtual  UVector *charClasses();
1573     virtual  void     setText(const UnicodeString &s);
1574     virtual  int32_t  next(int32_t i);
1575 private:
1576     UVector   *fSets;
1577
1578     UnicodeSet  *fCRLFSet;
1579     UnicodeSet  *fControlSet;
1580     UnicodeSet  *fExtendSet;
1581     UnicodeSet  *fZWJSet;
1582     UnicodeSet  *fRegionalIndicatorSet;
1583     UnicodeSet  *fPrependSet;
1584     UnicodeSet  *fSpacingSet;
1585     UnicodeSet  *fLSet;
1586     UnicodeSet  *fVSet;
1587     UnicodeSet  *fTSet;
1588     UnicodeSet  *fLVSet;
1589     UnicodeSet  *fLVTSet;
1590     UnicodeSet  *fHangulSet;
1591     UnicodeSet  *fExtendedPictSet;
1592     UnicodeSet  *fAnySet;
1593
1594     const UnicodeString *fText;
1595 };
1596
1597
1598 RBBICharMonkey::RBBICharMonkey() {
1599     UErrorCode  status = U_ZERO_ERROR;
1600
1601     fText = NULL;
1602
1603     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1604     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
1605     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
1606     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
1607     fRegionalIndicatorSet =
1608                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1609     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1610     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1611     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1612     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1613     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1614     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1615     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1616     fHangulSet  = new UnicodeSet();
1617     fHangulSet->addAll(*fLSet);
1618     fHangulSet->addAll(*fVSet);
1619     fHangulSet->addAll(*fTSet);
1620     fHangulSet->addAll(*fLVSet);
1621     fHangulSet->addAll(*fLVTSet);
1622
1623     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1624     fAnySet           = new UnicodeSet(0, 0x10ffff);
1625
1626     fSets             = new UVector(status);
1627     fSets->addElement(fCRLFSet,    status);
1628     fSets->addElement(fControlSet, status);
1629     fSets->addElement(fExtendSet,  status);
1630     fSets->addElement(fRegionalIndicatorSet, status);
1631     if (!fPrependSet->isEmpty()) {
1632         fSets->addElement(fPrependSet, status);
1633     }
1634     fSets->addElement(fSpacingSet, status);
1635     fSets->addElement(fHangulSet,  status);
1636     fSets->addElement(fAnySet,     status);
1637     fSets->addElement(fZWJSet,     status);
1638     fSets->addElement(fExtendedPictSet, status);
1639     if (U_FAILURE(status)) {
1640         deferredStatus = status;
1641     }
1642 }
1643
1644
1645 void RBBICharMonkey::setText(const UnicodeString &s) {
1646     fText = &s;
1647 }
1648
1649
1650
1651 int32_t RBBICharMonkey::next(int32_t prevPos) {
1652     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1653                               //   break position being tested.  The candidate break
1654                               //   location is before p2.
1655
1656     int     breakPos = -1;
1657
1658     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1659     UChar32 cBase;            // for (X Extend*) patterns, the X character.
1660
1661     if (U_FAILURE(deferredStatus)) {
1662         return -1;
1663     }
1664
1665     // Previous break at end of string.  return DONE.
1666     if (prevPos >= fText->length()) {
1667         return -1;
1668     }
1669     p0 = p1 = p2 = p3 = prevPos;
1670     c3 =  fText->char32At(prevPos);
1671     c0 = c1 = c2 = cBase = 0;
1672     (void)p0;   // suppress set but not used warning.
1673     (void)c0;
1674
1675     // Loop runs once per "significant" character position in the input text.
1676     for (;;) {
1677         // Move all of the positions forward in the input string.
1678         p0 = p1;  c0 = c1;
1679         p1 = p2;  c1 = c2;
1680         p2 = p3;  c2 = c3;
1681
1682         // Advancd p3 by one codepoint
1683         p3 = fText->moveIndex32(p3, 1);
1684         c3 = fText->char32At(p3);
1685
1686         if (p1 == p2) {
1687             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1688             continue;
1689         }
1690         if (p2 == fText->length()) {
1691             // Reached end of string.  Always a break position.
1692             break;
1693         }
1694
1695         // Rule  GB3   CR x LF
1696         //     No Extend or Format characters may appear between the CR and LF,
1697         //     which requires the additional check for p2 immediately following p1.
1698         //
1699         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1700             continue;
1701         }
1702
1703         // Rule (GB4).   ( Control | CR | LF ) <break>
1704         if (fControlSet->contains(c1) ||
1705             c1 == 0x0D ||
1706             c1 == 0x0A)  {
1707             break;
1708         }
1709
1710         // Rule (GB5)    <break>  ( Control | CR | LF )
1711         //
1712         if (fControlSet->contains(c2) ||
1713             c2 == 0x0D ||
1714             c2 == 0x0A)  {
1715             break;
1716         }
1717
1718
1719         // Rule (GB6)  L x ( L | V | LV | LVT )
1720         if (fLSet->contains(c1) &&
1721                (fLSet->contains(c2)  ||
1722                 fVSet->contains(c2)  ||
1723                 fLVSet->contains(c2) ||
1724                 fLVTSet->contains(c2))) {
1725             continue;
1726         }
1727
1728         // Rule (GB7)    ( LV | V )  x  ( V | T )
1729         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1730             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1731             continue;
1732         }
1733
1734         // Rule (GB8)    ( LVT | T)  x T
1735         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1736             fTSet->contains(c2))  {
1737             continue;
1738         }
1739
1740         // Rule (GB9)    x (Extend | ZWJ)
1741         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
1742             if (!fExtendSet->contains(c1)) {
1743                 cBase = c1;
1744             }
1745             continue;
1746         }
1747
1748         // Rule (GB9a)   x  SpacingMark
1749         if (fSpacingSet->contains(c2)) {
1750             continue;
1751         }
1752
1753         // Rule (GB9b)   Prepend x
1754         if (fPrependSet->contains(c1)) {
1755             continue;
1756         }
1757
1758         // Rule (GB11)   Extended_Pictographic Extend * ZWJ x Extended_Pictographic
1759         if (fExtendedPictSet->contains(cBase) && fZWJSet->contains(c1) && fExtendedPictSet->contains(c2)) {
1760             continue;
1761         }
1762
1763         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
1764         //                   Note: The first if condition is a little tricky. We only need to force
1765         //                      a break if there are three or more contiguous RIs. If there are
1766         //                      only two, a break following will occur via other rules, and will include
1767         //                      any trailing extend characters, which is needed behavior.
1768         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
1769                 && fRegionalIndicatorSet->contains(c2)) {
1770             break;
1771         }
1772         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1773             continue;
1774         }
1775
1776         // Rule (GB999)  Any  <break>  Any
1777         break;
1778     }
1779
1780     breakPos = p2;
1781     return breakPos;
1782 }
1783
1784
1785
1786 UVector  *RBBICharMonkey::charClasses() {
1787     return fSets;
1788 }
1789
1790
1791 RBBICharMonkey::~RBBICharMonkey() {
1792     delete fSets;
1793     delete fCRLFSet;
1794     delete fControlSet;
1795     delete fExtendSet;
1796     delete fRegionalIndicatorSet;
1797     delete fPrependSet;
1798     delete fSpacingSet;
1799     delete fLSet;
1800     delete fVSet;
1801     delete fTSet;
1802     delete fLVSet;
1803     delete fLVTSet;
1804     delete fHangulSet;
1805     delete fAnySet;
1806     delete fZWJSet;
1807     delete fExtendedPictSet;
1808 }
1809
1810 //------------------------------------------------------------------------------------------
1811 //
1812 //   class RBBIWordMonkey      Word Break specific implementation
1813 //                             of RBBIMonkeyKind.
1814 //
1815 //------------------------------------------------------------------------------------------
1816 class RBBIWordMonkey: public RBBIMonkeyKind {
1817 public:
1818     RBBIWordMonkey();
1819     virtual          ~RBBIWordMonkey();
1820     virtual  UVector *charClasses();
1821     virtual  void     setText(const UnicodeString &s);
1822     virtual int32_t   next(int32_t i);
1823 private:
1824     UVector      *fSets;
1825
1826     UnicodeSet  *fCRSet;
1827     UnicodeSet  *fLFSet;
1828     UnicodeSet  *fNewlineSet;
1829     UnicodeSet  *fRegionalIndicatorSet;
1830     UnicodeSet  *fKatakanaSet;
1831     UnicodeSet  *fHebrew_LetterSet;
1832     UnicodeSet  *fALetterSet;
1833     UnicodeSet  *fSingle_QuoteSet;
1834     UnicodeSet  *fDouble_QuoteSet;
1835     UnicodeSet  *fMidNumLetSet;
1836     UnicodeSet  *fMidLetterSet;
1837     UnicodeSet  *fMidNumSet;
1838     UnicodeSet  *fNumericSet;
1839     UnicodeSet  *fFormatSet;
1840     UnicodeSet  *fOtherSet;
1841     UnicodeSet  *fExtendSet;
1842     UnicodeSet  *fExtendNumLetSet;
1843     UnicodeSet  *fWSegSpaceSet;
1844     UnicodeSet  *fDictionarySet;
1845     UnicodeSet  *fZWJSet;
1846     UnicodeSet  *fExtendedPictSet;
1847
1848     const UnicodeString  *fText;
1849 };
1850
1851
1852 RBBIWordMonkey::RBBIWordMonkey()
1853 {
1854     UErrorCode  status = U_ZERO_ERROR;
1855
1856     fSets            = new UVector(status);
1857
1858     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
1859     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
1860     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
1861     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
1862     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
1863     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
1864     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
1865     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
1866     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
1867     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
1868     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]",    status);
1869     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
1870     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]",      status);
1871     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
1872     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
1873     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
1874     fWSegSpaceSet     = new UnicodeSet(u"[\\p{Word_Break = WSegSpace}]",    status);
1875
1876     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
1877     fExtendedPictSet  = new UnicodeSet(u"[:Extended_Pictographic:]", status);
1878
1879     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
1880     fDictionarySet->addAll(*fKatakanaSet);
1881     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
1882
1883     fALetterSet->removeAll(*fDictionarySet);
1884
1885     fOtherSet        = new UnicodeSet();
1886     if(U_FAILURE(status)) {
1887         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
1888         deferredStatus = status;
1889         return;
1890     }
1891
1892     fOtherSet->complement();
1893     fOtherSet->removeAll(*fCRSet);
1894     fOtherSet->removeAll(*fLFSet);
1895     fOtherSet->removeAll(*fNewlineSet);
1896     fOtherSet->removeAll(*fKatakanaSet);
1897     fOtherSet->removeAll(*fHebrew_LetterSet);
1898     fOtherSet->removeAll(*fALetterSet);
1899     fOtherSet->removeAll(*fSingle_QuoteSet);
1900     fOtherSet->removeAll(*fDouble_QuoteSet);
1901     fOtherSet->removeAll(*fMidLetterSet);
1902     fOtherSet->removeAll(*fMidNumSet);
1903     fOtherSet->removeAll(*fNumericSet);
1904     fOtherSet->removeAll(*fExtendNumLetSet);
1905     fOtherSet->removeAll(*fWSegSpaceSet);
1906     fOtherSet->removeAll(*fFormatSet);
1907     fOtherSet->removeAll(*fExtendSet);
1908     fOtherSet->removeAll(*fRegionalIndicatorSet);
1909     fOtherSet->removeAll(*fZWJSet);
1910     fOtherSet->removeAll(*fExtendedPictSet);
1911
1912     // Inhibit dictionary characters from being tested at all.
1913     fOtherSet->removeAll(*fDictionarySet);
1914
1915     fSets->addElement(fCRSet,                status);
1916     fSets->addElement(fLFSet,                status);
1917     fSets->addElement(fNewlineSet,           status);
1918     fSets->addElement(fRegionalIndicatorSet, status);
1919     fSets->addElement(fHebrew_LetterSet,     status);
1920     fSets->addElement(fALetterSet,           status);
1921     fSets->addElement(fSingle_QuoteSet,      status);
1922     fSets->addElement(fDouble_QuoteSet,      status);
1923     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
1924                                                         // from the test data. They are all in the dictionary set,
1925                                                         // which this (old, to be retired) monkey test cannot handle.
1926     fSets->addElement(fMidLetterSet,         status);
1927     fSets->addElement(fMidNumLetSet,         status);
1928     fSets->addElement(fMidNumSet,            status);
1929     fSets->addElement(fNumericSet,           status);
1930     fSets->addElement(fFormatSet,            status);
1931     fSets->addElement(fExtendSet,            status);
1932     fSets->addElement(fOtherSet,             status);
1933     fSets->addElement(fExtendNumLetSet,      status);
1934     fSets->addElement(fWSegSpaceSet,         status);
1935
1936     fSets->addElement(fZWJSet,               status);
1937     fSets->addElement(fExtendedPictSet,      status);
1938
1939     if (U_FAILURE(status)) {
1940         deferredStatus = status;
1941     }
1942 }
1943
1944 void RBBIWordMonkey::setText(const UnicodeString &s) {
1945     fText       = &s;
1946 }
1947
1948
1949 int32_t RBBIWordMonkey::next(int32_t prevPos) {
1950     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1951                               //   break position being tested.  The candidate break
1952                               //   location is before p2.
1953
1954     int     breakPos = -1;
1955
1956     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1957
1958     if (U_FAILURE(deferredStatus)) {
1959         return -1;
1960     }
1961
1962     // Prev break at end of string.  return DONE.
1963     if (prevPos >= fText->length()) {
1964         return -1;
1965     }
1966     p0 = p1 = p2 = p3 = prevPos;
1967     c3 =  fText->char32At(prevPos);
1968     c0 = c1 = c2 = 0;
1969     (void)p0;       // Suppress set but not used warning.
1970
1971     // Loop runs once per "significant" character position in the input text.
1972     for (;;) {
1973         // Move all of the positions forward in the input string.
1974         p0 = p1;  c0 = c1;
1975         p1 = p2;  c1 = c2;
1976         p2 = p3;  c2 = c3;
1977
1978         // Advancd p3 by    X(Extend | Format)*   Rule 4
1979         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
1980         do {
1981             p3 = fText->moveIndex32(p3, 1);
1982             c3 = fText->char32At(p3);
1983             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
1984                break;
1985             };
1986         }
1987         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
1988
1989
1990         if (p1 == p2) {
1991             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1992             continue;
1993         }
1994         if (p2 == fText->length()) {
1995             // Reached end of string.  Always a break position.
1996             break;
1997         }
1998
1999         // Rule  (3)   CR x LF
2000         //     No Extend or Format characters may appear between the CR and LF,
2001         //     which requires the additional check for p2 immediately following p1.
2002         //
2003         if (c1==0x0D && c2==0x0A) {
2004             continue;
2005         }
2006
2007         // Rule (3a)  Break before and after newlines (including CR and LF)
2008         //
2009         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2010             break;
2011         };
2012         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2013             break;
2014         };
2015
2016         // Rule (3c)    ZWJ x Extended_Pictographic
2017         //              Not ignoring extend chars, so peek into input text to
2018         //              get the potential ZWJ, the character immediately preceding c2.
2019         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2020         //              but char32At will get the full code point.
2021         if (fZWJSet->contains(fText->char32At(p2-1)) && fExtendedPictSet->contains(c2)) {
2022             continue;
2023         }
2024
2025         // Rule (3d)    Keep horizontal whitespace together.
2026         if (fWSegSpaceSet->contains(fText->char32At(p2-1)) && fWSegSpaceSet->contains(c2)) {
2027             continue;
2028         }
2029
2030         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2031         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2032             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2033             continue;
2034         }
2035
2036         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2037         //
2038         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2039              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2040              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2041             continue;
2042         }
2043
2044         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2045         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2046             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2047             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2048             continue;
2049         }
2050
2051         // Rule (7a)     Hebrew_Letter x Single_Quote
2052         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2053             continue;
2054         }
2055
2056         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2057         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2058             continue;
2059         }
2060
2061         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2062         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2063             continue;
2064         }
2065
2066         // Rule (8)    Numeric x Numeric
2067         if (fNumericSet->contains(c1) &&
2068             fNumericSet->contains(c2))  {
2069             continue;
2070         }
2071
2072         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2073         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2074             fNumericSet->contains(c2))  {
2075             continue;
2076         }
2077
2078         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2079         if (fNumericSet->contains(c1) &&
2080             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2081             continue;
2082         }
2083
2084         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2085         if (fNumericSet->contains(c0) &&
2086             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2087             fNumericSet->contains(c2)) {
2088             continue;
2089         }
2090
2091         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2092         if (fNumericSet->contains(c1) &&
2093             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2094             fNumericSet->contains(c3)) {
2095             continue;
2096         }
2097
2098         // Rule (13)  Katakana x Katakana
2099         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2100         //                  all Katakana are handled by the dictionary breaker.
2101         if (fKatakanaSet->contains(c1) &&
2102             fKatakanaSet->contains(c2))  {
2103             continue;
2104         }
2105
2106         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2107         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2108              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2109              fExtendNumLetSet->contains(c2)) {
2110                 continue;
2111         }
2112
2113         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2114         if (fExtendNumLetSet->contains(c1) &&
2115                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2116                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2117             continue;
2118         }
2119
2120         // Rule 15 - 17   Group pairs of Regional Indicators.
2121         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2122             break;
2123         }
2124         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2125             continue;
2126         }
2127
2128         // Rule 999.  Break found here.
2129         break;
2130     }
2131
2132     breakPos = p2;
2133     return breakPos;
2134 }
2135
2136
2137 UVector  *RBBIWordMonkey::charClasses() {
2138     return fSets;
2139 }
2140
2141
2142 RBBIWordMonkey::~RBBIWordMonkey() {
2143     delete fSets;
2144     delete fCRSet;
2145     delete fLFSet;
2146     delete fNewlineSet;
2147     delete fKatakanaSet;
2148     delete fHebrew_LetterSet;
2149     delete fALetterSet;
2150     delete fSingle_QuoteSet;
2151     delete fDouble_QuoteSet;
2152     delete fMidNumLetSet;
2153     delete fMidLetterSet;
2154     delete fMidNumSet;
2155     delete fNumericSet;
2156     delete fFormatSet;
2157     delete fExtendSet;
2158     delete fExtendNumLetSet;
2159     delete fWSegSpaceSet;
2160     delete fRegionalIndicatorSet;
2161     delete fDictionarySet;
2162     delete fOtherSet;
2163     delete fZWJSet;
2164     delete fExtendedPictSet;
2165 }
2166
2167
2168
2169
2170 //------------------------------------------------------------------------------------------
2171 //
2172 //   class RBBISentMonkey      Sentence Break specific implementation
2173 //                             of RBBIMonkeyKind.
2174 //
2175 //------------------------------------------------------------------------------------------
2176 class RBBISentMonkey: public RBBIMonkeyKind {
2177 public:
2178     RBBISentMonkey();
2179     virtual          ~RBBISentMonkey();
2180     virtual  UVector *charClasses();
2181     virtual  void     setText(const UnicodeString &s);
2182     virtual int32_t   next(int32_t i);
2183 private:
2184     int               moveBack(int posFrom);
2185     int               moveForward(int posFrom);
2186     UChar32           cAt(int pos);
2187
2188     UVector      *fSets;
2189
2190     UnicodeSet  *fSepSet;
2191     UnicodeSet  *fFormatSet;
2192     UnicodeSet  *fSpSet;
2193     UnicodeSet  *fLowerSet;
2194     UnicodeSet  *fUpperSet;
2195     UnicodeSet  *fOLetterSet;
2196     UnicodeSet  *fNumericSet;
2197     UnicodeSet  *fATermSet;
2198     UnicodeSet  *fSContinueSet;
2199     UnicodeSet  *fSTermSet;
2200     UnicodeSet  *fCloseSet;
2201     UnicodeSet  *fOtherSet;
2202     UnicodeSet  *fExtendSet;
2203
2204     const UnicodeString  *fText;
2205
2206 };
2207
2208 RBBISentMonkey::RBBISentMonkey()
2209 {
2210     UErrorCode  status = U_ZERO_ERROR;
2211
2212     fSets            = new UVector(status);
2213
2214     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2215     //                       set and made into character classes of their own.  For the monkey impl,
2216     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2217     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2218     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2219     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2220     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2221     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2222     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2223     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2224     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2225     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2226     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2227     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2228     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2229     fOtherSet        = new UnicodeSet();
2230
2231     if(U_FAILURE(status)) {
2232       deferredStatus = status;
2233       return;
2234     }
2235
2236     fOtherSet->complement();
2237     fOtherSet->removeAll(*fSepSet);
2238     fOtherSet->removeAll(*fFormatSet);
2239     fOtherSet->removeAll(*fSpSet);
2240     fOtherSet->removeAll(*fLowerSet);
2241     fOtherSet->removeAll(*fUpperSet);
2242     fOtherSet->removeAll(*fOLetterSet);
2243     fOtherSet->removeAll(*fNumericSet);
2244     fOtherSet->removeAll(*fATermSet);
2245     fOtherSet->removeAll(*fSContinueSet);
2246     fOtherSet->removeAll(*fSTermSet);
2247     fOtherSet->removeAll(*fCloseSet);
2248     fOtherSet->removeAll(*fExtendSet);
2249
2250     fSets->addElement(fSepSet,       status);
2251     fSets->addElement(fFormatSet,    status);
2252     fSets->addElement(fSpSet,        status);
2253     fSets->addElement(fLowerSet,     status);
2254     fSets->addElement(fUpperSet,     status);
2255     fSets->addElement(fOLetterSet,   status);
2256     fSets->addElement(fNumericSet,   status);
2257     fSets->addElement(fATermSet,     status);
2258     fSets->addElement(fSContinueSet, status);
2259     fSets->addElement(fSTermSet,     status);
2260     fSets->addElement(fCloseSet,     status);
2261     fSets->addElement(fOtherSet,     status);
2262     fSets->addElement(fExtendSet,    status);
2263
2264     if (U_FAILURE(status)) {
2265         deferredStatus = status;
2266     }
2267 }
2268
2269
2270
2271 void RBBISentMonkey::setText(const UnicodeString &s) {
2272     fText       = &s;
2273 }
2274
2275 UVector  *RBBISentMonkey::charClasses() {
2276     return fSets;
2277 }
2278
2279
2280 //  moveBack()   Find the "significant" code point preceding the index i.
2281 //               Skips over ($Extend | $Format)* .
2282 //
2283 int RBBISentMonkey::moveBack(int i) {
2284     if (i <= 0) {
2285         return -1;
2286     }
2287     UChar32   c;
2288     int32_t   j = i;
2289     do {
2290         j = fText->moveIndex32(j, -1);
2291         c = fText->char32At(j);
2292     }
2293     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2294     return j;
2295
2296  }
2297
2298
2299 int RBBISentMonkey::moveForward(int i) {
2300     if (i>=fText->length()) {
2301         return fText->length();
2302     }
2303     UChar32   c;
2304     int32_t   j = i;
2305     do {
2306         j = fText->moveIndex32(j, 1);
2307         c = cAt(j);
2308     }
2309     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2310     return j;
2311 }
2312
2313 UChar32 RBBISentMonkey::cAt(int pos) {
2314     if (pos<0 || pos>=fText->length()) {
2315         return -1;
2316     } else {
2317         return fText->char32At(pos);
2318     }
2319 }
2320
2321 int32_t RBBISentMonkey::next(int32_t prevPos) {
2322     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2323                               //   break position being tested.  The candidate break
2324                               //   location is before p2.
2325
2326     int     breakPos = -1;
2327
2328     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2329     UChar32 c;
2330
2331     if (U_FAILURE(deferredStatus)) {
2332         return -1;
2333     }
2334
2335     // Prev break at end of string.  return DONE.
2336     if (prevPos >= fText->length()) {
2337         return -1;
2338     }
2339     p0 = p1 = p2 = p3 = prevPos;
2340     c3 =  fText->char32At(prevPos);
2341     c0 = c1 = c2 = 0;
2342     (void)p0;     // Suppress set but not used warning.
2343
2344     // Loop runs once per "significant" character position in the input text.
2345     for (;;) {
2346         // Move all of the positions forward in the input string.
2347         p0 = p1;  c0 = c1;
2348         p1 = p2;  c1 = c2;
2349         p2 = p3;  c2 = c3;
2350
2351         // Advancd p3 by    X(Extend | Format)*   Rule 4
2352         p3 = moveForward(p3);
2353         c3 = cAt(p3);
2354
2355         // Rule (3)  CR x LF
2356         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2357             continue;
2358         }
2359
2360         // Rule (4).   Sep  <break>
2361         if (fSepSet->contains(c1)) {
2362             p2 = p1+1;   // Separators don't combine with Extend or Format.
2363             break;
2364         }
2365
2366         if (p2 >= fText->length()) {
2367             // Reached end of string.  Always a break position.
2368             break;
2369         }
2370
2371         if (p2 == prevPos) {
2372             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2373             continue;
2374         }
2375
2376         // Rule (6).   ATerm x Numeric
2377         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2378             continue;
2379         }
2380
2381         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2382         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2383                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2384             continue;
2385         }
2386
2387         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2388         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2389         //                  note to the Unicode 5.0 documents.
2390         int p8 = p1;
2391         while (fSpSet->contains(cAt(p8))) {
2392             p8 = moveBack(p8);
2393         }
2394         while (fCloseSet->contains(cAt(p8))) {
2395             p8 = moveBack(p8);
2396         }
2397         if (fATermSet->contains(cAt(p8))) {
2398             p8=p2;
2399             for (;;) {
2400                 c = cAt(p8);
2401                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2402                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2403                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2404                     break;
2405                 }
2406                 p8 = moveForward(p8);
2407             }
2408             if (fLowerSet->contains(cAt(p8))) {
2409                 continue;
2410             }
2411         }
2412
2413         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2414         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2415             p8 = p1;
2416             while (fSpSet->contains(cAt(p8))) {
2417                 p8 = moveBack(p8);
2418             }
2419             while (fCloseSet->contains(cAt(p8))) {
2420                 p8 = moveBack(p8);
2421             }
2422             c = cAt(p8);
2423             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2424                 continue;
2425             }
2426         }
2427
2428         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2429         int p9 = p1;
2430         while (fCloseSet->contains(cAt(p9))) {
2431             p9 = moveBack(p9);
2432         }
2433         c = cAt(p9);
2434         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2435             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2436                 continue;
2437             }
2438         }
2439
2440         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2441         int p10 = p1;
2442         while (fSpSet->contains(cAt(p10))) {
2443             p10 = moveBack(p10);
2444         }
2445         while (fCloseSet->contains(cAt(p10))) {
2446             p10 = moveBack(p10);
2447         }
2448         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2449             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2450                 continue;
2451             }
2452         }
2453
2454         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2455         int p11 = p1;
2456         if (fSepSet->contains(cAt(p11))) {
2457             p11 = moveBack(p11);
2458         }
2459         while (fSpSet->contains(cAt(p11))) {
2460             p11 = moveBack(p11);
2461         }
2462         while (fCloseSet->contains(cAt(p11))) {
2463             p11 = moveBack(p11);
2464         }
2465         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2466             break;
2467         }
2468
2469         //  Rule (12)  Any x Any
2470         continue;
2471     }
2472     breakPos = p2;
2473     return breakPos;
2474 }
2475
2476 RBBISentMonkey::~RBBISentMonkey() {
2477     delete fSets;
2478     delete fSepSet;
2479     delete fFormatSet;
2480     delete fSpSet;
2481     delete fLowerSet;
2482     delete fUpperSet;
2483     delete fOLetterSet;
2484     delete fNumericSet;
2485     delete fATermSet;
2486     delete fSContinueSet;
2487     delete fSTermSet;
2488     delete fCloseSet;
2489     delete fOtherSet;
2490     delete fExtendSet;
2491 }
2492
2493
2494
2495 //-------------------------------------------------------------------------------------------
2496 //
2497 //  RBBILineMonkey
2498 //
2499 //-------------------------------------------------------------------------------------------
2500
2501 class RBBILineMonkey: public RBBIMonkeyKind {
2502 public:
2503     RBBILineMonkey();
2504     virtual          ~RBBILineMonkey();
2505     virtual  UVector *charClasses();
2506     virtual  void     setText(const UnicodeString &s);
2507     virtual  int32_t  next(int32_t i);
2508     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2509 private:
2510     UVector      *fSets;
2511
2512     UnicodeSet  *fBK;
2513     UnicodeSet  *fCR;
2514     UnicodeSet  *fLF;
2515     UnicodeSet  *fCM;
2516     UnicodeSet  *fNL;
2517     UnicodeSet  *fSG;
2518     UnicodeSet  *fWJ;
2519     UnicodeSet  *fZW;
2520     UnicodeSet  *fGL;
2521     UnicodeSet  *fCB;
2522     UnicodeSet  *fSP;
2523     UnicodeSet  *fB2;
2524     UnicodeSet  *fBA;
2525     UnicodeSet  *fBB;
2526     UnicodeSet  *fHY;
2527     UnicodeSet  *fH2;
2528     UnicodeSet  *fH3;
2529     UnicodeSet  *fCL;
2530     UnicodeSet  *fCP;
2531     UnicodeSet  *fEX;
2532     UnicodeSet  *fIN;
2533     UnicodeSet  *fJL;
2534     UnicodeSet  *fJV;
2535     UnicodeSet  *fJT;
2536     UnicodeSet  *fNS;
2537     UnicodeSet  *fOP;
2538     UnicodeSet  *fQU;
2539     UnicodeSet  *fIS;
2540     UnicodeSet  *fNU;
2541     UnicodeSet  *fPO;
2542     UnicodeSet  *fPR;
2543     UnicodeSet  *fSY;
2544     UnicodeSet  *fAI;
2545     UnicodeSet  *fAL;
2546     UnicodeSet  *fCJ;
2547     UnicodeSet  *fHL;
2548     UnicodeSet  *fID;
2549     UnicodeSet  *fRI;
2550     UnicodeSet  *fXX;
2551     UnicodeSet  *fEB;
2552     UnicodeSet  *fEM;
2553     UnicodeSet  *fZJ;
2554
2555     BreakIterator        *fCharBI;
2556     const UnicodeString  *fText;
2557     RegexMatcher         *fNumberMatcher;
2558 };
2559
2560 RBBILineMonkey::RBBILineMonkey() :
2561     RBBIMonkeyKind(),
2562     fSets(NULL),
2563
2564     fCharBI(NULL),
2565     fText(NULL),
2566     fNumberMatcher(NULL)
2567
2568 {
2569     if (U_FAILURE(deferredStatus)) {
2570         return;
2571     }
2572
2573     UErrorCode  status = U_ZERO_ERROR;
2574
2575     fSets  = new UVector(status);
2576
2577     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2578     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2579     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2580     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2581     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2582     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2583     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2584     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2585     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2586     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2587     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2588     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2589     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2590     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2591     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2592     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2593     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2594     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2595     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2596     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2597     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2598     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2599     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2600     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2601     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2602     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2603     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2604     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2605     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2606     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2607     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2608     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2609     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2610     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2611     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2612     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2613     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2614     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2615     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2616     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2617     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
2618     fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
2619
2620     if (U_FAILURE(status)) {
2621         deferredStatus = status;
2622         return;
2623     }
2624
2625     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2626     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2627     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2628
2629     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2630     fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
2631
2632     fSets->addElement(fBK, status);
2633     fSets->addElement(fCR, status);
2634     fSets->addElement(fLF, status);
2635     fSets->addElement(fCM, status);
2636     fSets->addElement(fNL, status);
2637     fSets->addElement(fWJ, status);
2638     fSets->addElement(fZW, status);
2639     fSets->addElement(fGL, status);
2640     fSets->addElement(fCB, status);
2641     fSets->addElement(fSP, status);
2642     fSets->addElement(fB2, status);
2643     fSets->addElement(fBA, status);
2644     fSets->addElement(fBB, status);
2645     fSets->addElement(fHY, status);
2646     fSets->addElement(fH2, status);
2647     fSets->addElement(fH3, status);
2648     fSets->addElement(fCL, status);
2649     fSets->addElement(fCP, status);
2650     fSets->addElement(fEX, status);
2651     fSets->addElement(fIN, status);
2652     fSets->addElement(fJL, status);
2653     fSets->addElement(fJT, status);
2654     fSets->addElement(fJV, status);
2655     fSets->addElement(fNS, status);
2656     fSets->addElement(fOP, status);
2657     fSets->addElement(fQU, status);
2658     fSets->addElement(fIS, status);
2659     fSets->addElement(fNU, status);
2660     fSets->addElement(fPO, status);
2661     fSets->addElement(fPR, status);
2662     fSets->addElement(fSY, status);
2663     fSets->addElement(fAI, status);
2664     fSets->addElement(fAL, status);
2665     fSets->addElement(fHL, status);
2666     fSets->addElement(fID, status);
2667     fSets->addElement(fWJ, status);
2668     fSets->addElement(fRI, status);
2669     fSets->addElement(fSG, status);
2670     fSets->addElement(fEB, status);
2671     fSets->addElement(fEM, status);
2672     fSets->addElement(fZJ, status);
2673
2674
2675     const char *rules =
2676             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
2677             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
2678             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
2679             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
2680             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
2681             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
2682
2683     fNumberMatcher = new RegexMatcher(
2684         UnicodeString(rules, -1, US_INV), 0, status);
2685
2686     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2687
2688     if (U_FAILURE(status)) {
2689         deferredStatus = status;
2690     }
2691 }
2692
2693
2694 void RBBILineMonkey::setText(const UnicodeString &s) {
2695     fText       = &s;
2696     fCharBI->setText(s);
2697     fNumberMatcher->reset(s);
2698 }
2699
2700 //
2701 //  rule9Adjust
2702 //     Line Break TR rules 9 and 10 implementation.
2703 //     This deals with combining marks and other sequences that
2704 //     that must be treated as if they were something other than what they actually are.
2705 //
2706 //     This is factored out into a separate function because it must be applied twice for
2707 //     each potential break, once to the chars before the position being checked, then
2708 //     again to the text following the possible break.
2709 //
2710 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2711     if (pos == -1) {
2712         // Invalid initial position.  Happens during the warmup iteration of the
2713         //   main loop in next().
2714         return;
2715     }
2716
2717     int32_t  nPos = *nextPos;
2718
2719     // LB 9  Keep combining sequences together.
2720     //  advance over any CM class chars.  Note that Line Break CM is different
2721     //  from the normal Grapheme Extend property.
2722     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2723           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2724         for (;;) {
2725             *nextChar = fText->char32At(nPos);
2726             if (!fCM->contains(*nextChar)) {
2727                 break;
2728             }
2729             nPos = fText->moveIndex32(nPos, 1);
2730         }
2731     }
2732
2733
2734     // LB 9 Treat X CM* as if it were x.
2735     //       No explicit action required.
2736
2737     // LB 10  Treat any remaining combining mark as AL
2738     if (fCM->contains(*posChar)) {
2739         *posChar = u'A';
2740     }
2741
2742     // Push the updated nextPos and nextChar back to our caller.
2743     // This only makes a difference if posChar got bigger by consuming a
2744     // combining sequence.
2745     *nextPos  = nPos;
2746     *nextChar = fText->char32At(nPos);
2747 }
2748
2749
2750
2751 int32_t RBBILineMonkey::next(int32_t startPos) {
2752     UErrorCode status = U_ZERO_ERROR;
2753     int32_t    pos;       //  Index of the char following a potential break position
2754     UChar32    thisChar;  //  Character at above position "pos"
2755
2756     int32_t    prevPos;   //  Index of the char preceding a potential break position
2757     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2758                           //   and thisChar may not be adjacent because combining
2759                           //   characters between them will be ignored.
2760
2761     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2762     UChar32    prevCharX2;
2763
2764     int32_t    nextPos;   //  Index of the next character following pos.
2765                           //     Usually skips over combining marks.
2766     int32_t    nextCPPos; //  Index of the code point following "pos."
2767                           //     May point to a combining mark.
2768     int32_t    tPos;      //  temp value.
2769     UChar32    c;
2770
2771     if (U_FAILURE(deferredStatus)) {
2772         return -1;
2773     }
2774
2775     if (startPos >= fText->length()) {
2776         return -1;
2777     }
2778
2779
2780     // Initial values for loop.  Loop will run the first time without finding breaks,
2781     //                           while the invalid values shift out and the "this" and
2782     //                           "prev" positions are filled in with good values.
2783     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2784     thisChar = prevChar  = prevCharX2 = 0;
2785     nextPos  = nextCPPos = startPos;
2786
2787
2788     // Loop runs once per position in the test text, until a break position
2789     //  is found.
2790     for (;;) {
2791         prevPosX2 = prevPos;
2792         prevCharX2 = prevChar;
2793
2794         prevPos   = pos;
2795         prevChar  = thisChar;
2796
2797         pos       = nextPos;
2798         thisChar  = fText->char32At(pos);
2799
2800         nextCPPos = fText->moveIndex32(pos, 1);
2801         nextPos   = nextCPPos;
2802
2803         // Rule LB2 - Break at end of text.
2804         if (pos >= fText->length()) {
2805             break;
2806         }
2807
2808         // Rule LB 9 - adjust for combining sequences.
2809         //             We do this one out-of-order because the adjustment does not change anything
2810         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2811         //             be applied.
2812         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
2813         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2814         c = fText->char32At(nextPos);
2815         rule9Adjust(pos,     &thisChar, &nextPos, &c);
2816
2817         // If the loop is still warming up - if we haven't shifted the initial
2818         //   -1 positions out of prevPos yet - loop back to advance the
2819         //    position in the input without any further looking for breaks.
2820         if (prevPos == -1) {
2821             continue;
2822         }
2823
2824         // LB 4  Always break after hard line breaks,
2825         if (fBK->contains(prevChar)) {
2826             break;
2827         }
2828
2829         // LB 5  Break after CR, LF, NL, but not inside CR LF
2830         if (prevChar == 0x0d && thisChar == 0x0a) {
2831             continue;
2832         }
2833         if (prevChar == 0x0d ||
2834             prevChar == 0x0a ||
2835             prevChar == 0x85)  {
2836             break;
2837         }
2838
2839         // LB 6  Don't break before hard line breaks
2840         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
2841             fBK->contains(thisChar)) {
2842                 continue;
2843         }
2844
2845
2846         // LB 7  Don't break before spaces or zero-width space.
2847         if (fSP->contains(thisChar)) {
2848             continue;
2849         }
2850
2851         if (fZW->contains(thisChar)) {
2852             continue;
2853         }
2854
2855         // LB 8  Break after zero width space
2856         if (fZW->contains(prevChar)) {
2857             break;
2858         }
2859
2860         // LB 25    Numbers
2861         //          Move this test up, before LB8a, because numbers can match a longer sequence that would
2862         //          also match 8a.  e.g. NU ZWJ IS PO     (ZWJ acts like CM)
2863         if (fNumberMatcher->lookingAt(prevPos, status)) {
2864             if (U_FAILURE(status)) {
2865                 break;
2866             }
2867             // Matched a number.  But could have been just a single digit, which would
2868             //    not represent a "no break here" between prevChar and thisChar
2869             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
2870             if (numEndIdx > pos) {
2871                 // Number match includes at least our two chars being checked
2872                 if (numEndIdx > nextPos) {
2873                     // Number match includes additional chars.  Update pos and nextPos
2874                     //   so that next loop iteration will continue at the end of the number,
2875                     //   checking for breaks between last char in number & whatever follows.
2876                     pos = nextPos = numEndIdx;
2877                     do {
2878                         pos = fText->moveIndex32(pos, -1);
2879                         thisChar = fText->char32At(pos);
2880                     } while (fCM->contains(thisChar));
2881                 }
2882                 continue;
2883             }
2884         }
2885
2886         // LB 8a ZWJ x
2887         //       The monkey test's way of ignoring combining characters doesn't work
2888         //       for this rule. ZJ is also a CM. Need to get the actual character
2889         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
2890         {
2891             int32_t prevIdx = fText->moveIndex32(pos, -1);
2892             UChar32 prevC = fText->char32At(prevIdx);
2893             if (fZJ->contains(prevC)) {
2894                 continue;
2895             }
2896         }
2897
2898         // LB 9, 10  Already done, at top of loop.
2899         //
2900
2901
2902         // LB 11  Do not break before or after WORD JOINER and related characters.
2903         //    x  WJ
2904         //    WJ  x
2905         //
2906         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
2907             continue;
2908         }
2909
2910         // LB 12
2911         //    GL  x
2912         if (fGL->contains(prevChar)) {
2913             continue;
2914         }
2915
2916         // LB 12a
2917         //    [^SP BA HY] x GL
2918         if (!(fSP->contains(prevChar) ||
2919               fBA->contains(prevChar) ||
2920               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
2921             continue;
2922         }
2923
2924
2925
2926         // LB 13  Don't break before closings.
2927         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
2928         //        fall into LB 17 and the more general number regular expression.
2929         //
2930         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
2931             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
2932                                          fEX->contains(thisChar)  ||
2933             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
2934             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
2935             continue;
2936         }
2937
2938         // LB 14 Don't break after OP SP*
2939         //       Scan backwards, checking for this sequence.
2940         //       The OP char could include combining marks, so we actually check for
2941         //           OP CM* SP*
2942         //       Another Twist: The Rule 67 fixes may have changed a SP CM
2943         //       sequence into a ID char, so before scanning back through spaces,
2944         //       verify that prevChar is indeed a space.  The prevChar variable
2945         //       may differ from fText[prevPos]
2946         tPos = prevPos;
2947         if (fSP->contains(prevChar)) {
2948             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2949                 tPos=fText->moveIndex32(tPos, -1);
2950             }
2951         }
2952         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
2953             tPos=fText->moveIndex32(tPos, -1);
2954         }
2955         if (fOP->contains(fText->char32At(tPos))) {
2956             continue;
2957         }
2958
2959
2960         // LB 15    QU SP* x OP
2961         if (fOP->contains(thisChar)) {
2962             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
2963             int tPos = prevPos;
2964             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2965                 tPos = fText->moveIndex32(tPos, -1);
2966             }
2967             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
2968                 tPos = fText->moveIndex32(tPos, -1);
2969             }
2970             if (fQU->contains(fText->char32At(tPos))) {
2971                 continue;
2972             }
2973         }
2974
2975
2976
2977         // LB 16   (CL | CP) SP* x NS
2978         //    Scan backwards for SP* CM* (CL | CP)
2979         if (fNS->contains(thisChar)) {
2980             int tPos = prevPos;
2981             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
2982                 tPos = fText->moveIndex32(tPos, -1);
2983             }
2984             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
2985                 tPos = fText->moveIndex32(tPos, -1);
2986             }
2987             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
2988                 continue;
2989             }
2990         }
2991
2992
2993         // LB 17        B2 SP* x B2
2994         if (fB2->contains(thisChar)) {
2995             //  Scan backwards, checking for the B2 CM* SP* sequence.
2996             tPos = prevPos;
2997             if (fSP->contains(prevChar)) {
2998                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
2999                     tPos=fText->moveIndex32(tPos, -1);
3000                 }
3001             }
3002             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3003                 tPos=fText->moveIndex32(tPos, -1);
3004             }
3005             if (fB2->contains(fText->char32At(tPos))) {
3006                 continue;
3007             }
3008         }
3009
3010
3011         // LB 18    break after space
3012         if (fSP->contains(prevChar)) {
3013             break;
3014         }
3015
3016         // LB 19
3017         //    x   QU
3018         //    QU  x
3019         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3020             continue;
3021         }
3022
3023         // LB 20  Break around a CB
3024         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3025             break;
3026         }
3027
3028         // LB 21
3029         if (fBA->contains(thisChar) ||
3030             fHY->contains(thisChar) ||
3031             fNS->contains(thisChar) ||
3032             fBB->contains(prevChar) )   {
3033             continue;
3034         }
3035
3036         // LB 21a
3037         //   HL (HY | BA) x
3038         if (fHL->contains(prevCharX2) &&
3039                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3040             continue;
3041         }
3042
3043         // LB 21b
3044         //   SY x HL
3045         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3046             continue;
3047         }
3048
3049         // LB 22
3050         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3051             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3052             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3053             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3054             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3055             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3056             continue;
3057         }
3058
3059
3060         // LB 23    (AL | HL) x NU
3061         //          NU x (AL | HL)
3062         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3063             continue;
3064         }
3065         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3066             continue;
3067         }
3068
3069         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3070         //      PR x (ID | EB | EM)
3071         //     (ID | EB | EM) x PO
3072         if (fPR->contains(prevChar) &&
3073                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3074             continue;
3075         }
3076         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3077                 fPO->contains(thisChar)) {
3078             continue;
3079         }
3080
3081         // LB 24  Do not break between prefix and letters or ideographs.
3082         //         (PR | PO) x (AL | HL)
3083         //         (AL | HL) x (PR | PO)
3084         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3085                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3086             continue;
3087         }
3088         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3089                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3090             continue;
3091         }
3092
3093         // LB 25 numbers match, moved up, before LB 8a,
3094
3095         // LB 26 Do not break a Korean syllable.
3096         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3097                                         fJV->contains(thisChar) ||
3098                                         fH2->contains(thisChar) ||
3099                                         fH3->contains(thisChar))) {
3100                                             continue;
3101                                         }
3102
3103         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3104             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3105                 continue;
3106         }
3107
3108         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3109             fJT->contains(thisChar)) {
3110                 continue;
3111         }
3112
3113         // LB 27 Treat a Korean Syllable Block the same as ID.
3114         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3115             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3116             fIN->contains(thisChar)) {
3117                 continue;
3118             }
3119         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3120             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3121             fPO->contains(thisChar)) {
3122                 continue;
3123             }
3124         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3125             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3126                 continue;
3127             }
3128
3129
3130
3131         // LB 28  Do not break between alphabetics ("at").
3132         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3133             continue;
3134         }
3135
3136         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3137         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3138             continue;
3139         }
3140
3141         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3142         //          (AL | NU) x OP
3143         //          CP x (AL | NU)
3144         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3145             continue;
3146         }
3147         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3148             continue;
3149         }
3150
3151         // LB30a    RI RI <break> RI
3152         //             RI    x    RI
3153         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3154             break;
3155         }
3156         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3157             continue;
3158         }
3159
3160         // LB30b    Emoji Base x Emoji Modifier
3161         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3162             continue;
3163         }
3164
3165         // LB 31    Break everywhere else
3166         break;
3167
3168     }
3169
3170     return pos;
3171 }
3172
3173
3174 UVector  *RBBILineMonkey::charClasses() {
3175     return fSets;
3176 }
3177
3178
3179 RBBILineMonkey::~RBBILineMonkey() {
3180     delete fSets;
3181
3182     delete fBK;
3183     delete fCR;
3184     delete fLF;
3185     delete fCM;
3186     delete fNL;
3187     delete fWJ;
3188     delete fZW;
3189     delete fGL;
3190     delete fCB;
3191     delete fSP;
3192     delete fB2;
3193     delete fBA;
3194     delete fBB;
3195     delete fHY;
3196     delete fH2;
3197     delete fH3;
3198     delete fCL;
3199     delete fCP;
3200     delete fEX;
3201     delete fIN;
3202     delete fJL;
3203     delete fJV;
3204     delete fJT;
3205     delete fNS;
3206     delete fOP;
3207     delete fQU;
3208     delete fIS;
3209     delete fNU;
3210     delete fPO;
3211     delete fPR;
3212     delete fSY;
3213     delete fAI;
3214     delete fAL;
3215     delete fCJ;
3216     delete fHL;
3217     delete fID;
3218     delete fRI;
3219     delete fSG;
3220     delete fXX;
3221     delete fEB;
3222     delete fEM;
3223     delete fZJ;
3224
3225     delete fCharBI;
3226     delete fNumberMatcher;
3227 }
3228
3229
3230 //-------------------------------------------------------------------------------------------
3231 //
3232 //   TestMonkey
3233 //
3234 //     params
3235 //       seed=nnnnn        Random number starting seed.
3236 //                         Setting the seed allows errors to be reproduced.
3237 //       loop=nnn          Looping count.  Controls running time.
3238 //                         -1:  run forever.
3239 //                          0 or greater:  run length.
3240 //
3241 //       type = char | word | line | sent | title
3242 //
3243 //  Example:
3244 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3245 //
3246 //-------------------------------------------------------------------------------------------
3247
3248 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3249     int32_t val = defaultVal;
3250     name.append(" *= *(-?\\d+)");
3251     UErrorCode status = U_ZERO_ERROR;
3252     RegexMatcher m(name, params, 0, status);
3253     if (m.find()) {
3254         // The param exists.  Convert the string to an int.
3255         char valString[100];
3256         int32_t paramLength = m.end(1, status) - m.start(1, status);
3257         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3258             paramLength = (int32_t)(sizeof(valString)-2);
3259         }
3260         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3261         val = strtol(valString,  NULL, 10);
3262
3263         // Delete this parameter from the params string.
3264         m.reset();
3265         params = m.replaceFirst("", status);
3266     }
3267     U_ASSERT(U_SUCCESS(status));
3268     return val;
3269 }
3270 #endif
3271
3272 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3273 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3274                                     BreakIterator *bi,
3275                                     int expected[],
3276                                     int expectedcount)
3277 {
3278     int count = 0;
3279     int i = 0;
3280     int forward[50];
3281     bi->setText(ustr);
3282     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3283         forward[count] = i;
3284         if (count < expectedcount && expected[count] != i) {
3285             test->errln("%s:%d break forward test failed: expected %d but got %d",
3286                         __FILE__, __LINE__, expected[count], i);
3287             break;
3288         }
3289         count ++;
3290     }
3291     if (count != expectedcount) {
3292         printStringBreaks(ustr, expected, expectedcount);
3293         test->errln("%s:%d break forward test failed: missed %d match",
3294                     __FILE__, __LINE__, expectedcount - count);
3295         return;
3296     }
3297     // testing boundaries
3298     for (i = 1; i < expectedcount; i ++) {
3299         int j = expected[i - 1];
3300         if (!bi->isBoundary(j)) {
3301             printStringBreaks(ustr, expected, expectedcount);
3302             test->errln("%s:%d isBoundary() failed.  Expected boundary at position %d",
3303                     __FILE__, __LINE__, j);
3304             return;
3305         }
3306         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3307             if (bi->isBoundary(j)) {
3308                 printStringBreaks(ustr, expected, expectedcount);
3309                 test->errln("%s:%d isBoundary() failed.  Not expecting boundary at position %d",
3310                     __FILE__, __LINE__, j);
3311                 return;
3312             }
3313         }
3314     }
3315
3316     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3317         count --;
3318         if (forward[count] != i) {
3319             printStringBreaks(ustr, expected, expectedcount);
3320             test->errln("%s:%d happy break test previous() failed: expected %d but got %d",
3321                         __FILE__, __LINE__, forward[count], i);
3322             break;
3323         }
3324     }
3325     if (count != 0) {
3326         printStringBreaks(ustr, expected, expectedcount);
3327         test->errln("break test previous() failed: missed a match");
3328         return;
3329     }
3330
3331     // testing preceding
3332     for (i = 0; i < expectedcount - 1; i ++) {
3333         // int j = expected[i] + 1;
3334         int j = ustr.moveIndex32(expected[i], 1);
3335         for (; j <= expected[i + 1]; j ++) {
3336             int32_t expectedPreceding = expected[i];
3337             int32_t actualPreceding = bi->preceding(j);
3338             if (actualPreceding != expectedPreceding) {
3339                 printStringBreaks(ustr, expected, expectedcount);
3340                 test->errln("%s:%d preceding(%d): expected %d, got %d",
3341                         __FILE__, __LINE__, j, expectedPreceding, actualPreceding);
3342                 return;
3343             }
3344         }
3345     }
3346 }
3347 #endif
3348
3349 void RBBITest::TestWordBreaks(void)
3350 {
3351 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3352
3353     Locale        locale("en");
3354     UErrorCode    status = U_ZERO_ERROR;
3355     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3356     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3357     // Replaced any C+J characters in a row with a random sequence of characters
3358     // of the same length to make our C+J segmentation not get in the way.
3359     static const char *strlist[] =
3360     {
3361     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3362     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3363     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3364     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3365     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3366     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3367     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3368     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3369     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3370     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3371     "\\u2027\\U000e0067\\u0a47\\u00b7",
3372     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3373     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3374     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3375     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3376     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3377     "\\u0027\\u11af\\U000e0057\\u0602",
3378     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3379     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3380     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3381     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3382     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3383     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3384     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3385     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3386     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3387     "\\u18f4\\U000e0049\\u20e7\\u2027",
3388     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3389     "\\ua183\\u102d\\u0bec\\u003a",
3390     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3391     "\\u003a\\u0e57\\u0fad\\u002e",
3392     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3393     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3394     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3395     "\\u003a\\u0664\\u00b7\\u1fba",
3396     "\\u003b\\u0027\\u00b7\\u47a3",
3397     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3398     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3399     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3400     };
3401     int loop;
3402     if (U_FAILURE(status)) {
3403         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3404         return;
3405     }
3406     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3407         // printf("looping %d\n", loop);
3408         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3409         // RBBICharMonkey monkey;
3410         RBBIWordMonkey monkey;
3411
3412         int expected[50];
3413         int expectedcount = 0;
3414
3415         monkey.setText(ustr);
3416         int i;
3417         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3418             expected[expectedcount ++] = i;
3419         }
3420
3421         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3422     }
3423     delete bi;
3424 #endif
3425 }
3426
3427 void RBBITest::TestWordBoundary(void)
3428 {
3429     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3430     Locale        locale("en");
3431     UErrorCode    status = U_ZERO_ERROR;
3432     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3433     LocalPointer<BreakIterator> bi(BreakIterator::createWordInstance(locale, status), status);
3434     if (U_FAILURE(status)) {
3435         errcheckln(status, "%s:%d Creation of break iterator failed %s",
3436                 __FILE__, __LINE__, u_errorName(status));
3437         return;
3438     }
3439     UChar         str[50];
3440     static const char *strlist[] =
3441     {
3442     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3443     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3444     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3445     "\\u2027\\U000e0067\\u0a47\\u00b7",
3446     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3447     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3448     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3449     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3450     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3451     "\\u0027\\u11af\\U000e0057\\u0602",
3452     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3453     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3454     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3455     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3456     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3457     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3458     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3459     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3460     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3461     "\\u58f4\\U000e0049\\u20e7\\u2027",
3462     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3463     "\\ua183\\u102d\\u0bec\\u003a",
3464     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3465     "\\u003a\\u0e57\\u0fad\\u002e",
3466     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3467     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3468     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3469     "\\u003a\\u0664\\u00b7\\u1fba",
3470     "\\u003b\\u0027\\u00b7\\u47a3",
3471     };
3472     int loop;
3473     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3474         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3475         UnicodeString ustr(str);
3476         int forward[50];
3477         int count = 0;
3478
3479         bi->setText(ustr);
3480         int prev = -1;
3481         for (int32_t boundary = bi->first(); boundary != BreakIterator::DONE; boundary = bi->next()) {
3482             ++count;
3483             if (count >= UPRV_LENGTHOF(forward)) {
3484                 errln("%s:%d too many breaks found. (loop, count, boundary) = (%d, %d, %d)",
3485                         __FILE__, __LINE__, loop, count, boundary);
3486                 return;
3487             }
3488             forward[count] = boundary;
3489             if (boundary <= prev) {
3490                 errln("%s:%d bi::next() did not advance. (loop, prev, boundary) = (%d, %d, %d)\n",
3491                         __FILE__, __LINE__, loop, prev, boundary);
3492                 break;
3493             }
3494             for (int32_t nonBoundary = prev + 1; nonBoundary < boundary; nonBoundary ++) {
3495                 if (bi->isBoundary(nonBoundary)) {
3496                     printStringBreaks(ustr, forward, count);
3497                     errln("%s:%d isBoundary(nonBoundary) failed. (loop, prev, nonBoundary, boundary) = (%d, %d, %d, %d)",
3498                            __FILE__, __LINE__, loop, prev, nonBoundary, boundary);
3499                     return;
3500                 }
3501             }
3502             if (!bi->isBoundary(boundary)) {
3503                 printStringBreaks(ustr, forward, count);
3504                 errln("%s:%d happy boundary test failed: expected %d a boundary",
3505                        __FILE__, __LINE__, boundary);
3506                 return;
3507             }
3508             prev = boundary;
3509         }
3510     }
3511 }
3512
3513 void RBBITest::TestLineBreaks(void)
3514 {
3515 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3516     Locale        locale("en");
3517     UErrorCode    status = U_ZERO_ERROR;
3518     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3519     const int32_t  STRSIZE = 50;
3520     UChar         str[STRSIZE];
3521     static const char *strlist[] =
3522     {
3523      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3524      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3525              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3526      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3527              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3528      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3529      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3530      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3531      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3532      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3533      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3534      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3535      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3536      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3537      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3538      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3539      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3540      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3541      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3542      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3543      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3544      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3545      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3546      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3547      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3548      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3549      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3550      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3551      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3552      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3553      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3554      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3555      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3556      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3557      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3558      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3559      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3560      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3561      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3562          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3563     };
3564     int loop;
3565     TEST_ASSERT_SUCCESS(status);
3566     if (U_FAILURE(status)) {
3567         return;
3568     }
3569     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3570         // printf("looping %d\n", loop);
3571         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3572         if (t >= STRSIZE) {
3573             TEST_ASSERT(FALSE);
3574             continue;
3575         }
3576
3577
3578         UnicodeString ustr(str);
3579         RBBILineMonkey monkey;
3580         if (U_FAILURE(monkey.deferredStatus)) {
3581             continue;
3582         }
3583
3584         const int EXPECTEDSIZE = 50;
3585         int expected[EXPECTEDSIZE];
3586         int expectedcount = 0;
3587
3588         monkey.setText(ustr);
3589         int i;
3590         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3591             if (expectedcount >= EXPECTEDSIZE) {
3592                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3593                 return;
3594             }
3595             expected[expectedcount ++] = i;
3596         }
3597
3598         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3599     }
3600     delete bi;
3601 #endif
3602 }
3603
3604 void RBBITest::TestSentBreaks(void)
3605 {
3606 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3607     Locale        locale("en");
3608     UErrorCode    status = U_ZERO_ERROR;
3609     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3610     UChar         str[200];
3611     static const char *strlist[] =
3612     {
3613      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3614      "This\n",
3615      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3616      "\"Sentence ending with a quote.\" Bye.",
3617      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3618      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3619      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3620      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3621      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3622      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3623      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3624              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3625              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3626              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3627      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3628              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3629              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3630              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3631              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3632              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3633     };
3634     int loop;
3635     if (U_FAILURE(status)) {
3636         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3637         return;
3638     }
3639     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3640         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
3641         UnicodeString ustr(str);
3642
3643         RBBISentMonkey monkey;
3644         if (U_FAILURE(monkey.deferredStatus)) {
3645             continue;
3646         }
3647
3648         const int EXPECTEDSIZE = 50;
3649         int expected[EXPECTEDSIZE];
3650         int expectedcount = 0;
3651
3652         monkey.setText(ustr);
3653         int i;
3654         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3655             if (expectedcount >= EXPECTEDSIZE) {
3656                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3657                 return;
3658             }
3659             expected[expectedcount ++] = i;
3660         }
3661
3662         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3663     }
3664     delete bi;
3665 #endif
3666 }
3667
3668 void RBBITest::TestMonkey() {
3669 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3670
3671     UErrorCode     status    = U_ZERO_ERROR;
3672     int32_t        loopCount = 500;
3673     int32_t        seed      = 1;
3674     UnicodeString  breakType = "all";
3675     Locale         locale("en");
3676     UBool          useUText  = FALSE;
3677
3678     if (quick == FALSE) {
3679         loopCount = 10000;
3680     }
3681
3682     if (fTestParams) {
3683         UnicodeString p(fTestParams);
3684         loopCount = getIntParam("loop", p, loopCount);
3685         seed      = getIntParam("seed", p, seed);
3686
3687         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3688         if (m.find()) {
3689             breakType = m.group(1, status);
3690             m.reset();
3691             p = m.replaceFirst("", status);
3692         }
3693
3694         RegexMatcher u(" *utext", p, 0, status);
3695         if (u.find()) {
3696             useUText = TRUE;
3697             u.reset();
3698             p = u.replaceFirst("", status);
3699         }
3700
3701
3702         // m.reset(p);
3703         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3704             // Each option is stripped out of the option string as it is processed.
3705             // All options have been checked.  The option string should have been completely emptied..
3706             char buf[100];
3707             p.extract(buf, sizeof(buf), NULL, status);
3708             buf[sizeof(buf)-1] = 0;
3709             errln("Unrecognized or extra parameter:  %s\n", buf);
3710             return;
3711         }
3712
3713     }
3714
3715     if (breakType == "char" || breakType == "all") {
3716         RBBICharMonkey  m;
3717         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3718         if (U_SUCCESS(status)) {
3719             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3720             if (breakType == "all" && useUText==FALSE) {
3721                 // Also run a quick test with UText when "all" is specified
3722                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3723             }
3724         }
3725         else {
3726             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3727         }
3728         delete bi;
3729     }
3730
3731     if (breakType == "word" || breakType == "all") {
3732         logln("Word Break Monkey Test");
3733         RBBIWordMonkey  m;
3734         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3735         if (U_SUCCESS(status)) {
3736             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3737         }
3738         else {
3739             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3740         }
3741         delete bi;
3742     }
3743
3744     if (breakType == "line" || breakType == "all") {
3745         logln("Line Break Monkey Test");
3746         RBBILineMonkey  m;
3747         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3748         if (loopCount >= 10) {
3749             loopCount = loopCount / 5;   // Line break runs slower than the others.
3750         }
3751         if (U_SUCCESS(status)) {
3752             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3753         }
3754         else {
3755             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3756         }
3757         delete bi;
3758     }
3759
3760     if (breakType == "sent" || breakType == "all"  ) {
3761         logln("Sentence Break Monkey Test");
3762         RBBISentMonkey  m;
3763         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3764         if (loopCount >= 10) {
3765             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3766         }
3767         if (U_SUCCESS(status)) {
3768             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3769         }
3770         else {
3771             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3772         }
3773         delete bi;
3774     }
3775
3776 #endif
3777 }
3778
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations
//               - Number of random test strings to generate and check.
//                 -1 means run forever.
//       useUText
//               - Flag passed in by TestMonkey; TRUE when the "utext" test
//                 parameter was given.
//
3788 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3789                          int32_t numIterations, UBool useUText) {
3790
3791 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3792
3793     const int32_t    TESTSTRINGLEN = 500;
3794     UnicodeString    testText;
3795     int32_t          numCharClasses;
3796     UVector          *chClasses;
3797     int              expected[TESTSTRINGLEN*2 + 1];
3798     int              expectedCount = 0;
3799     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3800     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3801     char             reverseBreaks[TESTSTRINGLEN*2+1];
3802     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3803     char             followingBreaks[TESTSTRINGLEN*2+1];
3804     char             precedingBreaks[TESTSTRINGLEN*2+1];
3805     int              i;
3806     int              loopCount = 0;
3807
3808     m_seed = seed;
3809
3810     numCharClasses = mk.charClasses()->size();
3811     chClasses      = mk.charClasses();
3812
    // Check for errors that occurred during the construction of the MonkeyKind object.
    //  Can't report them where they occurred because errln() is a method coming from intlTest,
    //  and is not visible outside of RBBITest :-(
3816     if (U_FAILURE(mk.deferredStatus)) {
3817         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3818         return;
3819     }
3820
3821     // Verify that the character classes all have at least one member.
3822     for (i=0; i<numCharClasses; i++) {
3823         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3824         if (s == NULL || s->size() == 0) {
3825             errln("Character Class #%d is null or of zero size.", i);
3826             return;
3827         }
3828     }
3829
3830     while (loopCount < numIterations || numIterations == -1) {
3831         if (numIterations == -1 && loopCount % 10 == 0) {
3832             // If test is running in an infinite loop, display a periodic tic so
3833             //   we can tell that it is making progress.
3834             fprintf(stderr, ".");
3835         }
3836         // Save current random number seed, so that we can recreate the random numbers
3837         //   for this loop iteration in event of an error.
3838         seed = m_seed;
3839
3840         // Populate a test string with data.
3841         testText.truncate(0);
3842         for (i=0; i<TESTSTRINGLEN; i++) {
3843             int32_t  aClassNum = m_rand() % numCharClasses;
3844             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3845             int32_t   charIdx = m_rand() % classSet->size();
3846             UChar32   c = classSet->charAt(charIdx);
3847             if (c < 0) {   // TODO:  deal with sets containing strings.
3848                 errln("%s:%d c < 0", __FILE__, __LINE__);
3849                 break;
3850             }
3851             // Do not assemble a supplementary character from randomly generated separate surrogates.
3852             //   (It could be a dictionary character)
3853             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
3854                 continue;
3855             }
3856
3857             testText.append(c);
3858         }
3859
3860         // Calculate the expected results for this test string.
3861         mk.setText(testText);
3862         memset(expectedBreaks, 0, sizeof(expectedBreaks));
3863         expectedBreaks[0] = 1;
3864         int32_t breakPos = 0;
3865         expectedCount = 0;
3866         for (;;) {
3867             breakPos = mk.next(breakPos);
3868             if (breakPos == -1) {
3869                 break;
3870             }
3871             if (breakPos > testText.length()) {
3872                 errln("breakPos > testText.length()");
3873             }
3874             expectedBreaks[breakPos] = 1;
3875             U_ASSERT(expectedCount<testText.length());
3876             expected[expectedCount ++] = breakPos;
3877             (void)expected;   // Set but not used warning.
3878                               // TODO (andy): check it out.
3879         }
3880
3881         // Find the break positions using forward iteration
3882         memset(forwardBreaks, 0, sizeof(forwardBreaks));
3883         if (useUText) {
3884             UErrorCode status = U_ZERO_ERROR;
3885             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3886             // testUText = utext_openUnicodeString(testUText, &testText, &status);
3887             bi->setText(testUText, status);
3888             TEST_ASSERT_SUCCESS(status);
3889             utext_close(testUText);   // The break iterator does a shallow clone of the UText
3890                                       //  This UText can be closed immediately, so long as the
3891                                       //  testText string continues to exist.
3892         } else {
3893             bi->setText(testText);
3894         }
3895
3896         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
3897             if (i < 0 || i > testText.length()) {
3898                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3899                 break;
3900             }
3901             forwardBreaks[i] = 1;
3902         }
3903
3904         // Find the break positions using reverse iteration
3905         memset(reverseBreaks, 0, sizeof(reverseBreaks));
3906         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
3907             if (i < 0 || i > testText.length()) {
3908                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
3909                 break;
3910             }
3911             reverseBreaks[i] = 1;
3912         }
3913
3914         // Find the break positions using isBoundary() tests.
3915         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
3916         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
3917         for (i=0; i<=testText.length(); i++) {
3918             isBoundaryBreaks[i] = bi->isBoundary(i);
3919         }
3920
3921
3922         // Find the break positions using the following() function.
3923         // printf(".");
3924         memset(followingBreaks, 0, sizeof(followingBreaks));
3925         int32_t   lastBreakPos = 0;
3926         followingBreaks[0] = 1;
3927         for (i=0; i<testText.length(); i++) {
3928             breakPos = bi->following(i);
3929             if (breakPos <= i ||
3930                 breakPos < lastBreakPos ||
3931                 breakPos > testText.length() ||
3932                 (breakPos > lastBreakPos && lastBreakPos > i)) {
3933                 errln("%s break monkey test: "
3934                     "Out of range value returned by BreakIterator::following().\n"
3935                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
3936                          name, seed, i, breakPos, lastBreakPos);
3937                 break;
3938             }
3939             followingBreaks[breakPos] = 1;
3940             lastBreakPos = breakPos;
3941         }
3942
3943         // Find the break positions using the preceding() function.
3944         memset(precedingBreaks, 0, sizeof(precedingBreaks));
3945         lastBreakPos = testText.length();
3946         precedingBreaks[testText.length()] = 1;
3947         for (i=testText.length(); i>0; i--) {
3948             breakPos = bi->preceding(i);
3949             if (breakPos >= i ||
3950                 breakPos > lastBreakPos ||
3951                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
3952                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
3953                 errln("%s break monkey test: "
3954                     "Out of range value returned by BreakIterator::preceding().\n"
3955                     "index=%d;  prev returned %d; lastBreak=%d" ,
3956                     name,  i, breakPos, lastBreakPos);
3957                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
3958                     precedingBreaks[i] = 2;   // Forces an error.
3959                 }
3960             } else {
3961                 if (breakPos >= 0) {
3962                     precedingBreaks[breakPos] = 1;
3963                 }
3964                 lastBreakPos = breakPos;
3965             }
3966         }
3967
3968         // Compare the expected and actual results.
3969         for (i=0; i<=testText.length(); i++) {
3970             const char *errorType = NULL;
3971             if  (forwardBreaks[i] != expectedBreaks[i]) {
3972                 errorType = "next()";
3973             } else if (reverseBreaks[i] != forwardBreaks[i]) {
3974                 errorType = "previous()";
3975             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
3976                 errorType = "isBoundary()";
3977             } else if (followingBreaks[i] != expectedBreaks[i]) {
3978                 errorType = "following()";
3979             } else if (precedingBreaks[i] != expectedBreaks[i]) {
3980                 errorType = "preceding()";
3981             }
3982
3983
3984             if (errorType != NULL) {
3985                 // Format a range of the test text that includes the failure as
3986                 //  a data item that can be included in the rbbi test data file.
3987
3988                 // Start of the range is the last point where expected and actual results
3989                 //   both agreed that there was a break position.
3990                 int startContext = i;
3991                 int32_t count = 0;
3992                 for (;;) {
3993                     if (startContext==0) { break; }
3994                     startContext --;
3995                     if (expectedBreaks[startContext] != 0) {
3996                         if (count == 2) break;
3997                         count ++;
3998                     }
3999                 }
4000
4001                 // End of range is two expected breaks past the start position.
4002                 int endContext = i + 1;
4003                 int ci;
4004                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4005                     for (;;) {
4006                         if (endContext >= testText.length()) {break;}
4007                         if (expectedBreaks[endContext-1] != 0) {
4008                             if (count == 0) break;
4009                             count --;
4010                         }
4011                         endContext ++;
4012                     }
4013                 }
4014
4015                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4016                 UnicodeString errorText = "<data>";
4017                 /***if (strcmp(errorType, "next()") == 0) {
4018                     startContext = 0;
4019                     endContext = testText.length();
4020
4021                     printStringBreaks(testText, expected, expectedCount);
4022                 }***/
4023
4024                 for (ci=startContext; ci<endContext;) {
4025                     UnicodeString hexChars("0123456789abcdef");
4026                     UChar32  c;
4027                     int      bn;
4028                     c = testText.char32At(ci);
4029                     if (ci == i) {
4030                         // This is the location of the error.
4031                         errorText.append("<?>");
4032                     } else if (expectedBreaks[ci] != 0) {
4033                         // This a non-error expected break position.
4034                         errorText.append("\\");
4035                     }
4036                     if (c < 0x10000) {
4037                         errorText.append("\\u");
4038                         for (bn=12; bn>=0; bn-=4) {
4039                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4040                         }
4041                     } else {
4042                         errorText.append("\\U");
4043                         for (bn=28; bn>=0; bn-=4) {
4044                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4045                         }
4046                     }
4047                     ci = testText.moveIndex32(ci, 1);
4048                 }
4049                 errorText.append("\\");
4050                 errorText.append("</data>\n");
4051
4052                 // Output the error
4053                 char  charErrorTxt[500];
4054                 UErrorCode status = U_ZERO_ERROR;
4055                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4056                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4057                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4058
4059                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4060                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4061                     errorType, seed, i, charErrorTxt);
4062                 break;
4063             }
4064         }
4065
4066         loopCount++;
4067     }
4068 #endif
4069 }
4070
4071
4072 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4073 //             This test checks the initial patch,
4074 //             which is to just keep it from crashing.  Correct word boundaries
4075 //             await a proper fix to the dictionary code.
4076 //
4077 void RBBITest::TestBug5532(void)  {
4078    // Text includes a mixture of Thai and Latin.
4079    const unsigned char utf8Data[] = {
4080            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4081            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4082            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4083            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4084            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4085            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4086            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4087            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4088            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4089            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4090            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4091
4092     UErrorCode status = U_ZERO_ERROR;
4093     UText utext=UTEXT_INITIALIZER;
4094     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4095     TEST_ASSERT_SUCCESS(status);
4096
4097     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4098     TEST_ASSERT_SUCCESS(status);
4099     if (U_SUCCESS(status)) {
4100         bi->setText(&utext, status);
4101         TEST_ASSERT_SUCCESS(status);
4102
4103         int32_t breakCount = 0;
4104         int32_t previousBreak = -1;
4105         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4106             // For now, just make sure that the break iterator doesn't hang.
4107             TEST_ASSERT(previousBreak < bi->current());
4108             previousBreak = bi->current();
4109         }
4110         TEST_ASSERT(breakCount > 0);
4111     }
4112     delete bi;
4113     utext_close(&utext);
4114 }
4115
4116
4117 void RBBITest::TestBug9983(void)  {
4118     UnicodeString text = UnicodeString("\\u002A"  // * Other
4119                                        "\\uFF65"  //   Other
4120                                        "\\u309C"  //   Katakana
4121                                        "\\uFF9F"  //   Extend
4122                                        "\\uFF65"  //   Other
4123                                        "\\u0020"  //   Other
4124                                        "\\u0000").unescape();
4125
4126     UErrorCode status = U_ZERO_ERROR;
4127     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4128         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4129     TEST_ASSERT_SUCCESS(status);
4130     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4131         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4132     TEST_ASSERT_SUCCESS(status);
4133     if (U_FAILURE(status)) {
4134         return;
4135     }
4136     int32_t offset, rstatus, iterationCount;
4137
4138     brkiter->setText(text);
4139     brkiter->last();
4140     iterationCount = 0;
4141     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4142         iterationCount++;
4143         rstatus = brkiter->getRuleStatus();
4144         (void)rstatus;     // Suppress set but not used warning.
4145         if (iterationCount >= 10) {
4146            break;
4147         }
4148     }
4149     TEST_ASSERT(iterationCount == 6);
4150
4151     brkiterPOSIX->setText(text);
4152     brkiterPOSIX->last();
4153     iterationCount = 0;
4154     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4155         iterationCount++;
4156         rstatus = brkiterPOSIX->getRuleStatus();
4157         (void)rstatus;     // Suppress set but not used warning.
4158         if (iterationCount >= 10) {
4159            break;
4160         }
4161     }
4162     TEST_ASSERT(iterationCount == 6);
4163 }
4164
4165 // Bug 7547 - verify that building a break iterator from empty rules produces an error.
4166 //
4167 void RBBITest::TestBug7547() {
4168     UnicodeString rules;
4169     UErrorCode status = U_ZERO_ERROR;
4170     UParseError parseError;
4171     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4172     if (status != U_BRK_RULE_SYNTAX) {
4173         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4174     }
4175     if (parseError.line != 1 || parseError.offset != 0) {
4176         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4177     }
4178 }
4179
4180
4181 void RBBITest::TestBug12797() {
4182     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4183     UErrorCode status = U_ZERO_ERROR;
4184     UParseError parseError;
4185     RuleBasedBreakIterator bi(rules, parseError, status);
4186     if (U_FAILURE(status)) {
4187         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4188         return;
4189     }
4190     UnicodeString text = "abc";
4191     bi.setText(text);
4192     bi.first();
4193     int32_t boundary = bi.next();
4194     if (boundary != 3) {
4195         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4196     }
4197 }
4198
4199 void RBBITest::TestBug12918() {
4200     // This test triggers an assertion failure in dictbe.cpp
4201     const UChar *crasherString = u"\u3325\u4a16";
4202     UErrorCode status = U_ZERO_ERROR;
4203     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4204     if (U_FAILURE(status)) {
4205         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4206         return;
4207     }
4208     ubrk_first(iter);
4209     int32_t pos = 0;
4210     int32_t lastPos = -1;
4211     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4212         if (pos <= lastPos) {
4213             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4214             break;
4215         }
4216     }
4217     ubrk_close(iter);
4218 }
4219
4220 void RBBITest::TestBug12932() {
4221     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4222     UnicodeString ruleStr(
4223             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4224             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4225             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4226             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4227             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4228             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4229
4230     UErrorCode status = U_ZERO_ERROR;
4231     UParseError parseError;
4232     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4233     if (status != U_BRK_RULE_SYNTAX) {
4234         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4235                 __FILE__, __LINE__, u_errorName(status));
4236     }
4237 }
4238
4239
4240 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4241 //             remain undivided by ICU char, word and line break.
4242 void RBBITest::TestEmoji() {
4243 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4244     UErrorCode  status = U_ZERO_ERROR;
4245
4246     CharString testFileName;
4247     testFileName.append(IntlTest::getSourceTestData(status), status);
4248     testFileName.appendPathPart("emoji-test.txt", status);
4249     if (U_FAILURE(status)) {
4250         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4251         return;
4252     }
4253     logln("Opening data file %s\n", testFileName.data());
4254
4255     int    len;
4256     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4257     if (U_FAILURE(status) || testFile == NULL) {
4258         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4259         return;
4260     }
4261     UnicodeString testFileAsString(testFile, len);
4262     delete [] testFile;
4263
4264     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4265     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4266     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4267     int32_t lineNumber = 0;
4268
4269     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4270     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4271     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4272     if (U_FAILURE(status)) {
4273         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4274         return;
4275     }
4276
4277     while (lineMatcher.find()) {
4278         ++lineNumber;
4279         UnicodeString line = lineMatcher.group(status);
4280         hexMatcher.reset(line);
4281         UnicodeString testString;   // accumulates the emoji sequence.
4282         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4283             UnicodeString hex = hexMatcher.group(1, status);
4284             if (hex.length() > 8) {
4285                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4286                 break;
4287             }
4288             CharString hex8;
4289             hex8.appendInvariantChars(hex, status);
4290             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4291             if (c<=0x10ffff) {
4292                 testString.append(c);
4293             } else {
4294                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4295                         __FILE__, __LINE__, lineNumber, hex8.data());
4296                 break;
4297             }
4298         }
4299
4300         if (testString.length() > 1) {
4301             charBreaks->setText(testString);
4302             charBreaks->first();
4303             int32_t firstBreak = charBreaks->next();
4304             if (testString.length() != firstBreak) {
4305                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4306                         __FILE__, __LINE__, lineNumber, firstBreak);
4307             }
4308             wordBreaks->setText(testString);
4309             wordBreaks->first();
4310             firstBreak = wordBreaks->next();
4311             if (testString.length() != firstBreak) {
4312                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4313                         __FILE__, __LINE__, lineNumber, firstBreak);
4314             }
4315             lineBreaks->setText(testString);
4316             lineBreaks->first();
4317             firstBreak = lineBreaks->next();
4318             if (testString.length() != firstBreak) {
4319                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4320                         __FILE__, __LINE__, lineNumber, firstBreak);
4321             }
4322         }
4323     }
4324 #endif
4325 }
4326
4327
4328 // TestBug12519  -  Correct handling of Locales by assignment / copy / clone
4329
4330 // WHERE Macro yields a literal string of the form "source_file_name:line number "
4331 // TODO: propose something equivalent as a test framework addition.
4332
// LINE(x) turns its argument into a string literal exactly as written.
#define LINE(x) #x
// XLINE(x) adds one level of indirection, so that an argument such as __LINE__
// is macro-expanded to its value before being turned into a string.
#define XLINE(x) LINE(x)
// WHERE is a single string literal, "<source file>:<line number> ", built by
// concatenation of adjacent literals.
#define WHERE __FILE__ ":" XLINE(__LINE__) " "
4336
4337 void RBBITest::TestBug12519() {
4338     UErrorCode status = U_ZERO_ERROR;
4339     LocalPointer<RuleBasedBreakIterator> biEn((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4340     LocalPointer<RuleBasedBreakIterator> biFr((RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getFrance(), status));
4341     if (!assertSuccess(WHERE, status)) {
4342         dataerrln("%s %d status = %s", __FILE__, __LINE__, u_errorName(status));
4343         return;
4344     }
4345     assertTrue(WHERE, Locale::getEnglish() == biEn->getLocale(ULOC_VALID_LOCALE, status));
4346
4347     assertTrue(WHERE, Locale::getFrench() == biFr->getLocale(ULOC_VALID_LOCALE, status));
4348     assertTrue(WHERE "Locales do not participate in BreakIterator equality.", *biEn == *biFr);
4349
4350     LocalPointer<RuleBasedBreakIterator>cloneEn((RuleBasedBreakIterator *)biEn->clone());
4351     assertTrue(WHERE, *biEn == *cloneEn);
4352     assertTrue(WHERE, Locale::getEnglish() == cloneEn->getLocale(ULOC_VALID_LOCALE, status));
4353
4354     LocalPointer<RuleBasedBreakIterator>cloneFr((RuleBasedBreakIterator *)biFr->clone());
4355     assertTrue(WHERE, *biFr == *cloneFr);
4356     assertTrue(WHERE, Locale::getFrench() == cloneFr->getLocale(ULOC_VALID_LOCALE, status));
4357
4358     LocalPointer<RuleBasedBreakIterator>biDe((RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getGerman(), status));
4359     UnicodeString text("Hallo Welt");
4360     biDe->setText(text);
4361     assertTrue(WHERE "before assignment of \"biDe = biFr\", they should be different, but are equal.", *biFr != *biDe);
4362     *biDe = *biFr;
4363     assertTrue(WHERE "after assignment of \"biDe = biFr\", they should be equal, but are not.", *biFr == *biDe);
4364 }
4365
4366 void RBBITest::TestBug12677() {
4367     // Check that stripping of comments from rules for getRules() is not confused by
4368     // the presence of '#' characters in the rules that do not introduce comments.
4369     UnicodeString rules(u"!!forward; \n"
4370                          "$x = [ab#];  # a set with a # literal. \n"
4371                          " # .;        # a comment that looks sort of like a rule.   \n"
4372                          " '#' '?';    # a rule with a quoted #   \n"
4373                        );
4374
4375     UErrorCode status = U_ZERO_ERROR;
4376     UParseError pe;
4377     RuleBasedBreakIterator bi(rules, pe, status);
4378     assertSuccess(WHERE, status);
4379     UnicodeString rtRules = bi.getRules();
4380     assertEquals(WHERE, UnicodeString(u"!!forward; $x = [ab#]; '#' '?'; "),  rtRules);
4381 }
4382
4383
4384 void RBBITest::TestTableRedundancies() {
4385     UErrorCode status = U_ZERO_ERROR;
4386
4387     LocalPointer<RuleBasedBreakIterator> bi (
4388         (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status));
4389     assertSuccess(WHERE, status);
4390     if (U_FAILURE(status)) return;
4391
4392     RBBIDataWrapper *dw = bi->fData;
4393     const RBBIStateTable *fwtbl = dw->fForwardTable;
4394     int32_t numCharClasses = dw->fHeader->fCatCount;
4395     // printf("Char Classes: %d     states: %d\n", numCharClasses, fwtbl->fNumStates);
4396
4397     // Check for duplicate columns (character categories)
4398
4399     std::vector<UnicodeString> columns;
4400     for (int32_t column = 0; column < numCharClasses; column++) {
4401         UnicodeString s;
4402         for (int32_t r = 1; r < (int32_t)fwtbl->fNumStates; r++) {
4403             RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4404             s.append(row->fNextState[column]);
4405         }
4406         columns.push_back(s);
4407     }
4408     // Ignore column (char class) 0 while checking; it's special, and may have duplicates.
4409     for (int c1=1; c1<numCharClasses; c1++) {
4410         for (int c2 = c1+1; c2 < numCharClasses; c2++) {
4411             if (columns.at(c1) == columns.at(c2)) {
4412                 errln("%s:%d Duplicate columns (%d, %d)\n", __FILE__, __LINE__, c1, c2);
4413                 goto out;
4414             }
4415         }
4416     }
4417   out:
4418
4419     // Check for duplicate states
4420     std::vector<UnicodeString> rows;
4421     for (int32_t r=0; r < (int32_t)fwtbl->fNumStates; r++) {
4422         UnicodeString s;
4423         RBBIStateTableRow  *row = (RBBIStateTableRow *) (fwtbl->fTableData + (fwtbl->fRowLen * r));
4424         assertTrue(WHERE, row->fAccepting >= -1);
4425         s.append(row->fAccepting + 1);   // values of -1 are expected.
4426         s.append(row->fLookAhead);
4427         s.append(row->fTagIdx);
4428         for (int32_t column = 0; column < numCharClasses; column++) {
4429             s.append(row->fNextState[column]);
4430         }
4431         rows.push_back(s);
4432     }
4433     for (int r1=0; r1 < (int32_t)fwtbl->fNumStates; r1++) {
4434         for (int r2 = r1+1; r2 < (int32_t)fwtbl->fNumStates; r2++) {
4435             if (rows.at(r1) == rows.at(r2)) {
4436                 errln("%s:%d Duplicate rows (%d, %d)\n", __FILE__, __LINE__, r1, r2);
4437                 return;
4438             }
4439         }
4440     }
4441 }
4442
4443 // Bug 13447: verify that getRuleStatus() returns the value corresponding to current(),
4444 //            even after next() has returned DONE.
4445
4446 void RBBITest::TestBug13447() {
4447     UErrorCode status = U_ZERO_ERROR;
4448     LocalPointer<RuleBasedBreakIterator> bi(
4449         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status));
4450     assertSuccess(WHERE, status);
4451     if (U_FAILURE(status)) return;
4452     UnicodeString data(u"1234");
4453     bi->setText(data);
4454     assertEquals(WHERE, UBRK_WORD_NONE, bi->getRuleStatus());
4455     assertEquals(WHERE, 4, bi->next());
4456     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4457     assertEquals(WHERE, UBRK_DONE, bi->next());
4458     assertEquals(WHERE, 4, bi->current());
4459     assertEquals(WHERE, UBRK_WORD_NUMBER, bi->getRuleStatus());
4460 }
4461
4462 //  TestReverse exercises both the synthesized safe reverse rules and the logic
4463 //  for filling the break iterator cache when starting from random positions
4464 //  in the text.
4465 //
4466 //  It's a monkey test, working on random data, with the expected data obtained
4467 //  from forward iteration (no safe rules involved), comparing with results
4468 //  when indexing into the interior of the string (safe rules needed).
4469
4470 void RBBITest::TestReverse() {
4471     UErrorCode status = U_ZERO_ERROR;
4472
4473     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4474             BreakIterator::createCharacterInstance(Locale::getEnglish(), status)));
4475     assertSuccess(WHERE, status, true);
4476     status = U_ZERO_ERROR;
4477     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4478             BreakIterator::createWordInstance(Locale::getEnglish(), status)));
4479     assertSuccess(WHERE, status, true);
4480     status = U_ZERO_ERROR;
4481     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4482             BreakIterator::createLineInstance(Locale::getEnglish(), status)));
4483     assertSuccess(WHERE, status, true);
4484     status = U_ZERO_ERROR;
4485     TestReverse(std::unique_ptr<RuleBasedBreakIterator>((RuleBasedBreakIterator *)
4486             BreakIterator::createSentenceInstance(Locale::getEnglish(), status)));
4487     assertSuccess(WHERE, status, true);
4488 }
4489
4490 void RBBITest::TestReverse(std::unique_ptr<RuleBasedBreakIterator>bi) {
4491     if (!bi) {
4492         return;
4493     }
4494
4495     // From the mapping trie in the break iterator's internal data, create a
4496     // vector of UnicodeStrings, one for each character category, containing
4497     // all of the code points that map to that category. Unicode planes 0 and 1 only,
4498     // to avoid an execess of unassigned code points.
4499
4500     RBBIDataWrapper *data = bi->fData;
4501     int32_t categoryCount = data->fHeader->fCatCount;
4502     UTrie2  *trie = data->fTrie;
4503
4504     std::vector<UnicodeString> strings(categoryCount, UnicodeString());
4505     for (int cp=0; cp<0x1fff0; ++cp) {
4506         int cat = utrie2_get32(trie, cp);
4507         cat &= ~0x4000;    // And off the dictionary bit from the category.
4508         assertTrue(WHERE, cat < categoryCount && cat >= 0);
4509         if (cat < 0 || cat >= categoryCount) return;
4510         strings[cat].append(cp);
4511     }
4512
4513     icu_rand randomGen;
4514     const int testStringLength = 10000;
4515     UnicodeString testString;
4516
4517     for (int i=0; i<testStringLength; ++i) {
4518         int charClass = randomGen() % categoryCount;
4519         if (strings[charClass].length() > 0) {
4520             int cp = strings[charClass].char32At(randomGen() % strings[charClass].length());
4521             testString.append(cp);
4522         }
4523     }
4524
4525     typedef std::pair<UBool, int32_t> Result;
4526     std::vector<Result> expectedResults;
4527     bi->setText(testString);
4528     for (int i=0; i<testString.length(); ++i) {
4529         bool isboundary = bi->isBoundary(i);
4530         int  ruleStatus = bi->getRuleStatus();
4531         expectedResults.push_back(std::make_pair(isboundary, ruleStatus));
4532     }
4533
4534     for (int i=testString.length()-1; i>=0; --i) {
4535         bi->setText(testString);   // clears the internal break cache
4536         Result expected = expectedResults[i];
4537         assertEquals(WHERE, expected.first, bi->isBoundary(i));
4538         assertEquals(WHERE, expected.second, bi->getRuleStatus());
4539     }
4540 }
4541
4542
4543 // Ticket 13692 - finding word boundaries in very large numbers or words could
4544 //                be very time consuming. When the problem was present, this test
4545 //                would run more than fifteen minutes, which is to say, the failure was noticeable.
4546
4547 void RBBITest::TestBug13692() {
4548     UErrorCode status = U_ZERO_ERROR;
4549     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4550             BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4551     if (!assertSuccess(WHERE, status, true)) {
4552         return;
4553     }
4554     constexpr int32_t LENGTH = 1000000;
4555     UnicodeString longNumber(LENGTH, (UChar32)u'3', LENGTH);
4556     for (int i=0; i<20; i+=2) {
4557         longNumber.setCharAt(i, u' ');
4558     }
4559     bi->setText(longNumber);
4560     assertFalse(WHERE, bi->isBoundary(LENGTH-5));
4561     assertSuccess(WHERE, status);
4562 }
4563
4564 //
4565 //  TestDebug    -  A place-holder test for debugging purposes.
4566 //                  For putting in fragments of other tests that can be invoked
4567 //                  for tracing  without a lot of unwanted extra stuff happening.
4568 //
4569 void RBBITest::TestDebug(void) {
4570     UErrorCode status = U_ZERO_ERROR;
4571     LocalPointer<RuleBasedBreakIterator> bi ((RuleBasedBreakIterator *)
4572             BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4573     if (!assertSuccess(WHERE, status, true)) {
4574         return;
4575     }
4576     const UnicodeString &rules = bi->getRules();
4577     UParseError pe;
4578     LocalPointer<RuleBasedBreakIterator> newbi(new RuleBasedBreakIterator(rules, pe, status));
4579     assertSuccess(WHERE, status);
4580 }
4581
4582 void RBBITest::TestProperties() {
4583     UErrorCode errorCode = U_ZERO_ERROR;
4584     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4585     if (!prependSet.isEmpty()) {
4586         errln(
4587             "[:GCB=Prepend:] is not empty any more. "
4588             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4589             "change this test to the opposite condition.");
4590     }
4591 }
4592
4593 #endif // #if !UCONFIG_NO_BREAK_ITERATION