icuSources/test/intltest/rbbitst.cpp

   1 // © 2016 and later: Unicode, Inc. and others.
   2 // License & terms of use: http://www.unicode.org/copyright.html
   3 /********************************************************************
   4  * COPYRIGHT:
   5  * Copyright (c) 1999-2016, International Business Machines Corporation and
   6  * others. All Rights Reserved.
   7  ********************************************************************/
   8 /************************************************************************
   9 *   Date        Name        Description
  10 *   12/15/99    Madhu        Creation.
  11 *   01/12/2000  Madhu        Updated for changed API and added new tests
  12 ************************************************************************/
  13
  14 #include "unicode/utypes.h"
  15 #if !UCONFIG_NO_BREAK_ITERATION
  16
  17 #include <stdio.h>
  18 #include <stdlib.h>
  19 #include <string.h>
  20
  21 #include "unicode/brkiter.h"
  22 #include "unicode/localpointer.h"
  23 #include "unicode/numfmt.h"
  24 #include "unicode/rbbi.h"
  25 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  26 #include "unicode/regex.h"
  27 #endif
  28 #include "unicode/schriter.h"
  29 #include "unicode/uchar.h"
  30 #include "unicode/utf16.h"
  31 #include "unicode/ucnv.h"
  32 #include "unicode/uniset.h"
  33 #include "unicode/uscript.h"
  34 #include "unicode/ustring.h"
  35 #include "unicode/utext.h"
  36
  37 #include "charstr.h"
  38 #include "cmemory.h"
  39 #include "cstr.h"
  40 #include "intltest.h"
  41 #include "rbbitst.h"
  42 #include "utypeinfo.h"  // for 'typeid' to work
  43 #include "uvector.h"
  44 #include "uvectr32.h"
  45
  46 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  47 #include "unicode/filteredbrk.h"
  48 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  49
// TEST_ASSERT(x): when the condition x is false, report a failure through
// errln(), naming the source file and line of the assertion.  The expansion is
// a braced block that calls errln() unqualified, so it can only be used where
// an errln() is in scope.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): when errcode is a failing UErrorCode, report it
// through errcheckln() together with the source position and the
// u_errorName() text of the code.  Execution continues afterwards; callers
// that must stop on failure test the status themselves.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  55
  56
//---------------------------------------------
// runIndexedTest
//---------------------------------------------


//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.

// Test dispatcher for the RBBITest suite.
//
//   index, exec, name   Not referenced directly here apart from 'exec'; they are
//                       presumably consumed by the TESTCASE_AUTO* macros
//                       (defined outside this file section - confirm there).
//   params              Stored in fTestParams before any test case is dispatched.
//
// The order of the TESTCASE_AUTO entries below is significant only in that it
// is the order in which the macros see the tests; entries are compiled in or
// out according to the UCONFIG_* switches guarding them.
void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
{
    if (exec) logln("TestSuite RuleBasedBreakIterator: ");
    fTestParams = params;

    TESTCASE_AUTO_BEGIN;
    // Tests guarded by !UCONFIG_NO_FILE_IO are omitted from builds without file I/O.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug4153072);
#endif
    TESTCASE_AUTO(TestStatusReturn);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestUnicodeFiles);
    TESTCASE_AUTO(TestEmptyString);
#endif
    TESTCASE_AUTO(TestGetAvailableLocales);
    TESTCASE_AUTO(TestGetDisplayName);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestEndBehaviour);
    TESTCASE_AUTO(TestWordBreaks);
    TESTCASE_AUTO(TestWordBoundary);
    TESTCASE_AUTO(TestLineBreaks);
    TESTCASE_AUTO(TestSentBreaks);
    TESTCASE_AUTO(TestExtended);
#endif
    // The monkey test additionally needs regular expression support.
#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestMonkey);
#endif
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug3818);
#endif
    TESTCASE_AUTO(TestDebug);
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(TestBug5775);
#endif
    TESTCASE_AUTO(TestBug9983);
    TESTCASE_AUTO(TestDictRules);
    TESTCASE_AUTO(TestBug5532);
    TESTCASE_AUTO(TestBug7547);
    TESTCASE_AUTO(TestBug12797);
    TESTCASE_AUTO(TestBug12918);
    TESTCASE_AUTO(TestBug12932);
    TESTCASE_AUTO(TestEmoji);
    TESTCASE_AUTO_END;
}
 112
 113
 114 //---------------------------------------------------------------------------
 115 //
 116 //   class BITestData   Holds a set of Break iterator test data and results
 117 //                      Includes
 118 //                         - the string data to be broken
 119 //                         - a vector of the expected break positions.
 120 //                         - a vector of source line numbers for the data,
 121 //                               (to help see where errors occured.)
 122 //                         - The expected break tag values.
 123 //                         - Vectors of actual break positions and tag values.
 124 //                         - Functions for comparing actual with expected and
 125 //                            reporting errors.
 126 //
 127 //----------------------------------------------------------------------------
 128 class BITestData {
 129 public:
 130     UnicodeString    fDataToBreak;
 131     UVector          fExpectedBreakPositions;
 132     UVector          fExpectedTags;
 133     UVector          fLineNum;
 134     UVector          fActualBreakPositions;   // Test Results.
 135     UVector          fActualTags;
 136
 137     BITestData(UErrorCode &status);
 138     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 139     void             checkResults(const char *heading, RBBITest *test);
 140     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 141     void             clearResults();
 142 };
 143
 144 //
 145 // Constructor.
 146 //
 147 BITestData::BITestData(UErrorCode &status)
 148 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 149   fActualTags(status)
 150 {
 151 }
 152
 153 //
 154 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 155 //                 The macro form collects the line number, which is helpful
 156 //                 when tracking down failures.
 157 //
 158 //                 A null data item is inserted at the start of each test's data
 159 //                  to put the starting zero into the data list.  The position saved for
 160 //                  each non-null item is its ending position.
 161 //
 162 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 163 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 164     if (U_FAILURE(status)) {return;}
 165     if (data != NULL) {
 166         fDataToBreak.append(CharsToUnicodeString(data));
 167     }
 168     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 169     fExpectedTags.addElement(tag, status);
 170     fLineNum.addElement(lineNum, status);
 171 }
 172
 173
 174 //
 175 //  checkResults.   Compare the actual and expected break positions, report any differences.
 176 //
 177 void BITestData::checkResults(const char *heading, RBBITest *test) {
 178     int32_t   expectedIndex = 0;
 179     int32_t   actualIndex = 0;
 180
 181     for (;;) {
 182         // If we've run through both the expected and actual results vectors, we're done.
 183         //   break out of the loop.
 184         if (expectedIndex >= fExpectedBreakPositions.size() &&
 185             actualIndex   >= fActualBreakPositions.size()) {
 186             break;
 187         }
 188
 189
 190         if (expectedIndex >= fExpectedBreakPositions.size()) {
 191             err(heading, test, expectedIndex-1, actualIndex);
 192             actualIndex++;
 193             continue;
 194         }
 195
 196         if (actualIndex >= fActualBreakPositions.size()) {
 197             err(heading, test, expectedIndex, actualIndex-1);
 198             expectedIndex++;
 199             continue;
 200         }
 201
 202         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 203             err(heading, test, expectedIndex, actualIndex);
 204             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 205             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 206                 actualIndex++;
 207             } else {
 208                 expectedIndex++;
 209             }
 210             continue;
 211         }
 212
 213         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 214             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 215                 heading, fLineNum.elementAt(expectedIndex),
 216                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 217         }
 218
 219         actualIndex++;
 220         expectedIndex++;
 221     }
 222 }
 223
 224 //
 225 //  err   -  An error was found.  Report it, along with information about where the
 226 //                                incorrectly broken test data appeared in the source file.
 227 //
 228 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 229 {
 230     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 231     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 232     int32_t   o        = 0;
 233     int32_t   line     = fLineNum.elementAti(expectedIdx);
 234     if (expectedIdx > 0) {
 235         // The line numbers are off by one because a premature break occurs somewhere
 236         //    within the previous item, rather than at the start of the current (expected) item.
 237         //    We want to report the offset of the unexpected break from the start of
 238         //      this previous item.
 239         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 240     }
 241     if (actual < expected) {
 242         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 243     } else {
 244         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 245     }
 246 }
 247
 248
 249 void BITestData::clearResults() {
 250     fActualBreakPositions.removeAllElements();
 251     fActualTags.removeAllElements();
 252 }
 253
 254
 255 //--------------------------------------------------------------------------------------
 256 //
 257 //    RBBITest    constructor and destructor
 258 //
 259 //--------------------------------------------------------------------------------------
 260
 261 RBBITest::RBBITest() {
 262     fTestParams = NULL;
 263 }
 264
 265
 266 RBBITest::~RBBITest() {
 267 }
 268
 269 //-----------------------------------------------------------------------------------
 270 //
 271 //   Test for status {tag} return value from break rules.
 272 //        TODO:  a more thorough test.
 273 //
 274 //-----------------------------------------------------------------------------------
 275 void RBBITest::TestStatusReturn() {
 276      UnicodeString rulesString1("$Letters = [:L:];\n"
 277                                   "$Numbers = [:N:];\n"
 278                                   "$Letters+{1};\n"
 279                                   "$Numbers+{2};\n"
 280                                   "Help\\ /me\\!{4};\n"
 281                                   "[^$Letters $Numbers];\n"
 282                                   "!.*;\n", -1, US_INV);
 283      UnicodeString testString1  = "abc123..abc Help me Help me!";
 284                                 // 01234567890123456789012345678
 285      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 286      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 287
 288      UErrorCode status=U_ZERO_ERROR;
 289      UParseError    parseError;
 290
 291      LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
 292      if(U_FAILURE(status)) {
 293          dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
 294          return;
 295      }
 296      int32_t  pos;
 297      int32_t  i = 0;
 298      bi->setText(testString1);
 299      for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 300          if (pos != bounds1[i]) {
 301              errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
 302              break;
 303          }
 304
 305          int tag = bi->getRuleStatus();
 306          if (tag != brkStatus[i]) {
 307              errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
 308              break;
 309          }
 310          i++;
 311      }
 312 }
 313
 314
 315 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 316     UErrorCode status = U_ZERO_ERROR;
 317     char name[100];
 318     printf("code    alpha extend alphanum type word sent line name\n");
 319     int nextExpectedIndex = 0;
 320     utext_setNativeIndex(tstr, 0);
 321     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
 322         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 323             printf("------------------------------------------------ %d\n", j);
 324             ++nextExpectedIndex;
 325         }
 326
 327         UChar32 c = utext_next32(tstr);
 328         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 329         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 330                            u_isUAlphabetic(c),
 331                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 332                            u_isalnum(c),
 333                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 334                                                   u_charType(c),
 335                                                   U_SHORT_PROPERTY_NAME),
 336                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 337                                                   u_getIntPropertyValue(c,
 338                                                           UCHAR_WORD_BREAK),
 339                                                   U_SHORT_PROPERTY_NAME),
 340                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 341                                    u_getIntPropertyValue(c,
 342                                            UCHAR_SENTENCE_BREAK),
 343                                    U_SHORT_PROPERTY_NAME),
 344                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 345                                    u_getIntPropertyValue(c,
 346                                            UCHAR_LINE_BREAK),
 347                                    U_SHORT_PROPERTY_NAME),
 348                            name);
 349     }
 350 }
 351
 352
 353 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 354    UErrorCode status = U_ZERO_ERROR;
 355    UText *tstr = NULL;
 356    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 357    if (U_FAILURE(status)) {
 358        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 359        return;
 360     }
 361    printStringBreaks(tstr, expected, expectedCount);
 362    utext_close(tstr);
 363 }
 364
 365
 366 void RBBITest::TestBug3818() {
 367     UErrorCode  status = U_ZERO_ERROR;
 368
 369     // Four Thai words...
 370     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 371                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 372     UnicodeString  thaiStr(thaiWordData);
 373
 374     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 375     if (U_FAILURE(status) || bi == NULL) {
 376         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 377         return;
 378     }
 379     bi->setText(thaiStr);
 380
 381     int32_t  startOfSecondWord = bi->following(1);
 382     if (startOfSecondWord != 4) {
 383         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 384             __FILE__, __LINE__, startOfSecondWord);
 385     }
 386     startOfSecondWord = bi->following(0);
 387     if (startOfSecondWord != 4) {
 388         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 389             __FILE__, __LINE__, startOfSecondWord);
 390     }
 391     delete bi;
 392 }
 393
//----------------------------------------------------------------------------
//
// generalIteratorTest      Given a break iterator and a set of test data,
//                          Run the tests and report the results.
//
//      bi    Iterator under test; its text is set to td.fDataToBreak here.
//      td    Test data and expected results.  It is passed on to each of the
//            individual checks below, which run one after another on the
//            same iterator.
//
//----------------------------------------------------------------------------
void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
{

    bi.setText(td.fDataToBreak);

    // Plain forward and backward iteration.
    testFirstAndNext(bi, td);

    testLastAndPrevious(bi, td);

    // Random-access style queries.
    testFollowing(bi, td);
    testPreceding(bi, td);
    testIsBoundary(bi, td);
    // Consistency of next(n) with repeated next() / previous(), using a clone.
    doMultipleSelectionTest(bi, td);
}
 414
 415
 416 //
 417 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 418 //                       kind of loop.
 419 //
 420 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 421 {
 422     UErrorCode  status = U_ZERO_ERROR;
 423     int32_t     p;
 424     int32_t     lastP = -1;
 425     int32_t     tag;
 426
 427     logln("Test first and next");
 428     bi.setText(td.fDataToBreak);
 429     td.clearResults();
 430
 431     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 432         td.fActualBreakPositions.addElement(p, status);  // Save result.
 433         tag = bi.getRuleStatus();
 434         td.fActualTags.addElement(tag, status);
 435         if (p <= lastP) {
 436             // If the iterator is not making forward progress, stop.
 437             //  No need to raise an error here, it'll be detected in the normal check of results.
 438             break;
 439         }
 440         lastP = p;
 441     }
 442     td.checkResults("testFirstAndNext", this);
 443 }
 444
 445
 446 //
 447 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 448 //
 449 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 450 {
 451     UErrorCode  status = U_ZERO_ERROR;
 452     int32_t     p;
 453     int32_t     lastP  = 0x7ffffffe;
 454     int32_t     tag;
 455
 456     logln("Test last and previous");
 457     bi.setText(td.fDataToBreak);
 458     td.clearResults();
 459
 460     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 461         // Save break position.  Insert it at start of vector of results, shoving
 462         //    already-saved results further towards the end.
 463         td.fActualBreakPositions.insertElementAt(p, 0, status);
 464         // bi.previous();   // TODO:  Why does this fix things up????
 465         // bi.next();
 466         tag = bi.getRuleStatus();
 467         td.fActualTags.insertElementAt(tag, 0, status);
 468         if (p >= lastP) {
 469             // If the iterator is not making progress, stop.
 470             //  No need to raise an error here, it'll be detected in the normal check of results.
 471             break;
 472         }
 473         lastP = p;
 474     }
 475     td.checkResults("testLastAndPrevious", this);
 476 }
 477
 478
 479 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 480 {
 481     UErrorCode  status = U_ZERO_ERROR;
 482     int32_t     p;
 483     int32_t     tag;
 484     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 485                                  //   cannot be -1; that is returned for DONE.
 486     int         i;
 487
 488     logln("testFollowing():");
 489     bi.setText(td.fDataToBreak);
 490     td.clearResults();
 491
 492     // Save the starting point, since we won't get that out of following.
 493     p = bi.first();
 494     td.fActualBreakPositions.addElement(p, status);  // Save result.
 495     tag = bi.getRuleStatus();
 496     td.fActualTags.addElement(tag, status);
 497
 498     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 499         p = bi.following(i);
 500         if (p != lastP) {
 501             if (p == RuleBasedBreakIterator::DONE) {
 502                 break;
 503             }
 504             // We've reached a new break position.  Save it.
 505             td.fActualBreakPositions.addElement(p, status);  // Save result.
 506             tag = bi.getRuleStatus();
 507             td.fActualTags.addElement(tag, status);
 508             lastP = p;
 509         }
 510     }
 511     // The loop normally exits by means of the break in the middle.
 512     // Make sure that the index was at the correct position for the break iterator to have
 513     //   returned DONE.
 514     if (i != td.fDataToBreak.length()) {
 515         errln("testFollowing():  iterator returned DONE prematurely.");
 516     }
 517
 518     // Full check of all results.
 519     td.checkResults("testFollowing", this);
 520 }
 521
 522
 523
 524 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 525     UErrorCode  status = U_ZERO_ERROR;
 526     int32_t     p;
 527     int32_t     tag;
 528     int32_t     lastP  = 0x7ffffffe;
 529     int         i;
 530
 531     logln("testPreceding():");
 532     bi.setText(td.fDataToBreak);
 533     td.clearResults();
 534
 535     p = bi.last();
 536     td.fActualBreakPositions.addElement(p, status);
 537     tag = bi.getRuleStatus();
 538     td.fActualTags.addElement(tag, status);
 539
 540     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 541         p = bi.preceding(i);
 542         if (p != lastP) {
 543             if (p == RuleBasedBreakIterator::DONE) {
 544                 break;
 545             }
 546             // We've reached a new break position.  Save it.
 547             td.fActualBreakPositions.insertElementAt(p, 0, status);
 548             lastP = p;
 549             tag = bi.getRuleStatus();
 550             td.fActualTags.insertElementAt(tag, 0, status);
 551         }
 552     }
 553     // The loop normally exits by means of the break in the middle.
 554     // Make sure that the index was at the correct position for the break iterator to have
 555     //   returned DONE.
 556     if (i != 0) {
 557         errln("testPreceding():  iterator returned DONE prematurely.");
 558     }
 559
 560     // Full check of all results.
 561     td.checkResults("testPreceding", this);
 562 }
 563
 564
 565
 566 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 567     UErrorCode  status = U_ZERO_ERROR;
 568     int         i;
 569     int32_t     tag;
 570
 571     logln("testIsBoundary():");
 572     bi.setText(td.fDataToBreak);
 573     td.clearResults();
 574
 575     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 576         if (bi.isBoundary(i)) {
 577             td.fActualBreakPositions.addElement(i, status);  // Save result.
 578             tag = bi.getRuleStatus();
 579             td.fActualTags.addElement(tag, status);
 580         }
 581     }
 582     td.checkResults("testIsBoundary: ", this);
 583 }
 584
 585
 586
 587 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 588 {
 589     iterator.setText(td.fDataToBreak);
 590
 591     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 592     int32_t offset = iterator.first();
 593     int32_t testOffset;
 594     int32_t count = 0;
 595
 596     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 597
 598     if (*testIterator != iterator)
 599         errln("clone() or operator!= failed: two clones compared unequal");
 600
 601     do {
 602         testOffset = testIterator->first();
 603         testOffset = testIterator->next(count);
 604         if (offset != testOffset)
 605             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 606
 607         if (offset != RuleBasedBreakIterator::DONE) {
 608             count++;
 609             offset = iterator.next();
 610
 611             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 612                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 613                 if (count > 10000 || offset == -1) {
 614                     errln("operator== failed too many times. Stopping test.");
 615                     if (offset == -1) {
 616                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 617                     }
 618                     return;
 619                 }
 620             }
 621         }
 622     } while (offset != RuleBasedBreakIterator::DONE);
 623
 624     // now do it backwards...
 625     offset = iterator.last();
 626     count = 0;
 627
 628     do {
 629         testOffset = testIterator->last();
 630         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 631         if (offset != testOffset)
 632             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 633
 634         if (offset != RuleBasedBreakIterator::DONE) {
 635             count--;
 636             offset = iterator.previous();
 637         }
 638     } while (offset != RuleBasedBreakIterator::DONE);
 639
 640     delete testIterator;
 641 }
 642
 643
 644 //---------------------------------------------
 645 //
 646 //     other tests
 647 //
 648 //---------------------------------------------
 649 void RBBITest::TestEmptyString()
 650 {
 651     UnicodeString text = "";
 652     UErrorCode status = U_ZERO_ERROR;
 653
 654     BITestData x(status);
 655     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 656     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 657     if (U_FAILURE(status))
 658     {
 659         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 660         return;
 661     }
 662     generalIteratorTest(*bi, x);
 663     delete bi;
 664 }
 665
 666 void RBBITest::TestGetAvailableLocales()
 667 {
 668     int32_t locCount = 0;
 669     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 670
 671     if (locCount == 0)
 672         dataerrln("getAvailableLocales() returned an empty list!");
 673     // Just make sure that it's returning good memory.
 674     int32_t i;
 675     for (i = 0; i < locCount; ++i) {
 676         logln(locList[i].getName());
 677     }
 678 }
 679
 680 //Testing the BreakIterator::getDisplayName() function
 681 void RBBITest::TestGetDisplayName()
 682 {
 683     UnicodeString   result;
 684
 685     BreakIterator::getDisplayName(Locale::getUS(), result);
 686     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 687         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 688                 + result);
 689
 690     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 691     if (result != "French (France)")
 692         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 693                 + result);
 694 }
 695 /**
 696  * Test End Behaviour
 697  * @bug 4068137
 698  */
 699 void RBBITest::TestEndBehaviour()
 700 {
 701     UErrorCode status = U_ZERO_ERROR;
 702     UnicodeString testString("boo.");
 703     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 704     if (U_FAILURE(status))
 705     {
 706         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 707         return;
 708     }
 709     wb->setText(testString);
 710
 711     if (wb->first() != 0)
 712         errln("Didn't get break at beginning of string.");
 713     if (wb->next() != 3)
 714         errln("Didn't get break before period in \"boo.\"");
 715     if (wb->current() != 4 && wb->next() != 4)
 716         errln("Didn't get break at end of string.");
 717     delete wb;
 718 }
 719 /*
 720  * @bug 4153072
 721  */
 722 void RBBITest::TestBug4153072() {
 723     UErrorCode status = U_ZERO_ERROR;
 724     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 725     if (U_FAILURE(status))
 726     {
 727         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 728         return;
 729     }
 730     UnicodeString str("...Hello, World!...");
 731     int32_t begin = 3;
 732     int32_t end = str.length() - 3;
 733     UBool onBoundary;
 734
 735     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 736     iter->adoptText(textIterator);
 737     int index;
 738     // Note: with the switch to UText, there is no way to restrict the
 739     //       iteration range to begin at an index other than zero.
 740     //       String character iterators created with a non-zero bound are
 741     //         treated by RBBI as being empty.
 742     for (index = -1; index < begin + 1; ++index) {
 743         onBoundary = iter->isBoundary(index);
 744         if (index == 0?  !onBoundary : onBoundary) {
 745             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 746                             " and begin index = " + begin);
 747         }
 748     }
 749     delete iter;
 750 }
 751
 752
 753 //
 754 // Test for problem reported by Ashok Matoria on 9 July 2007
 755 //    One.<kSoftHyphen><kSpace>Two.
 756 //
 757 //    Sentence break at start (0) and then on calling next() it breaks at
 758 //   'T' of "Two". Now, at this point if I do next() and
 759 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 760 //
 761 void RBBITest::TestBug5775() {
 762     UErrorCode status = U_ZERO_ERROR;
 763     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 764     TEST_ASSERT_SUCCESS(status);
 765     if (U_FAILURE(status)) {
 766         return;
 767     }
 768 // Check for status first for better handling of no data errors.
 769     TEST_ASSERT(bi != NULL);
 770     if (bi == NULL) {
 771         return;
 772     }
 773
 774     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 775     //               01234      56789
 776     s = s.unescape();
 777     bi->setText(s);
 778     int pos = bi->next();
 779     TEST_ASSERT(pos == 6);
 780     pos = bi->next();
 781     TEST_ASSERT(pos == 10);
 782     pos = bi->previous();
 783     TEST_ASSERT(pos == 6);
 784     delete bi;
 785 }
 786
 787
 788
 789 //------------------------------------------------------------------------------
 790 //
 791 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 792 //
 793 //------------------------------------------------------------------------------
 794
 795 struct TestParams {
 796     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 797                                            //   Changed out whenever test data changes break type.
 798
 799     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 800     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 801     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 802     UVector32       *srcCol;
 803
 804     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 805     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 806     CharString       utf8String;           // UTF-8 form of text to break.
 807
 808     TestParams(UErrorCode &status) : dataToBreak() {
 809         bi               = NULL;
 810         expectedBreaks   = new UVector32(status);
 811         srcLine          = new UVector32(status);
 812         srcCol           = new UVector32(status);
 813         textToBreak      = NULL;
 814         textMap          = new UVector32(status);
 815     }
 816
 817     ~TestParams() {
 818         delete bi;
 819         delete expectedBreaks;
 820         delete srcLine;
 821         delete srcCol;
 822         utext_close(textToBreak);
 823         delete textMap;
 824     }
 825
 826     int32_t getSrcLine(int32_t bp);
 827     int32_t getExpectedBreak(int32_t bp);
 828     int32_t getSrcCol(int32_t bp);
 829
 830     void setUTF16(UErrorCode &status);
 831     void setUTF8(UErrorCode &status);
 832 };
 833
 834 // Append a UnicodeString to a CharString with UTF-8 encoding.
 835 // Substitute any invalid chars.
 836 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 837 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 838     if (U_FAILURE(status)) {
 839         return;
 840     }
 841     int32_t utf8Length;
 842     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 843                        src.getBuffer(), src.length(),   // UTF-16 data
 844                        0xfffd, NULL,                    // Substitution char, number of subs.
 845                        &status);
 846     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 847         return;
 848     }
 849     status = U_ZERO_ERROR;
 850     int32_t capacity;
 851     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 852     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 853                        src.getBuffer(), src.length(),
 854                        0xfffd, NULL, &status);
 855     dest.append(buffer, utf8Length, status);
 856 }
 857
 858
 859 void TestParams::setUTF16(UErrorCode &status) {
 860     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 861     textMap->removeAllElements();
 862     for (int32_t i=0; i<dataToBreak.length(); i++) {
 863         if (i == dataToBreak.getChar32Start(i)) {
 864             textMap->addElement(i, status);
 865         } else {
 866             textMap->addElement(-1, status);
 867         }
 868     }
 869     textMap->addElement(dataToBreak.length(), status);
 870     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 871 }
 872
 873
 874 void TestParams::setUTF8(UErrorCode &status) {
 875     if (U_FAILURE(status)) {
 876         return;
 877     }
 878     utf8String.clear();
 879     CharStringAppend(utf8String, dataToBreak, status);
 880     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 881     if (U_FAILURE(status)) {
 882         return;
 883     }
 884
 885     textMap->removeAllElements();
 886     int32_t utf16Index = 0;
 887     for (;;) {
 888         textMap->addElement(utf16Index, status);
 889         UChar32 c32 = utext_current32(textToBreak);
 890         if (c32 < 0) {
 891             break;
 892         }
 893         utf16Index += U16_LENGTH(c32);
 894         utext_next32(textToBreak);
 895         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 896             textMap->addElement(-1, status);
 897         }
 898     }
 899     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 900 }
 901
 902
 903 int32_t TestParams::getSrcLine(int32_t bp) {
 904     if (bp >= textMap->size()) {
 905         bp = textMap->size() - 1;
 906     }
 907     int32_t i = 0;
 908     for(; bp >= 0 ; --bp) {
 909         // Move to a character boundary if we are not on one already.
 910         i = textMap->elementAti(bp);
 911         if (i >= 0) {
 912             break;
 913         }
 914     }
 915     return srcLine->elementAti(i);
 916 }
 917
 918
 919 int32_t TestParams::getExpectedBreak(int32_t bp) {
 920     if (bp >= textMap->size()) {
 921         return 0;
 922     }
 923     int32_t i = textMap->elementAti(bp);
 924     int32_t retVal = 0;
 925     if (i >= 0) {
 926         retVal = expectedBreaks->elementAti(i);
 927     }
 928     return retVal;
 929 }
 930
 931
 932 int32_t TestParams::getSrcCol(int32_t bp) {
 933     if (bp >= textMap->size()) {
 934         bp = textMap->size() - 1;
 935     }
 936     int32_t i = 0;
 937     for(; bp >= 0; --bp) {
 938         // Move bp to a character boundary if we are not on one already.
 939         i = textMap->elementAti(bp);
 940         if (i >= 0) {
 941             break;
 942         }
 943     }
 944     return srcCol->elementAti(i);
 945 }
 946
 947
 948 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 949     int32_t    bp;
 950     int32_t    prevBP;
 951     int32_t    i;
 952
 953     TEST_ASSERT_SUCCESS(status);
 954     if (U_FAILURE(status)) {
 955         return;
 956     }
 957
 958     if (t->bi == NULL) {
 959         return;
 960     }
 961
 962     t->bi->setText(t->textToBreak, status);
 963     //
 964     //  Run the iterator forward
 965     //
 966     prevBP = -1;
 967     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 968         if (prevBP ==  bp) {
 969             // Fail for lack of forward progress.
 970             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 971                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 972             break;
 973         }
 974
 975         // Check that there we didn't miss an expected break between the last one
 976         //  and this one.
 977         for (i=prevBP+1; i<bp; i++) {
 978             if (t->getExpectedBreak(i) != 0) {
 979                 int expected[] = {0, i};
 980                 printStringBreaks(t->dataToBreak, expected, 2);
 981                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 982                       i, t->getSrcLine(i), t->getSrcCol(i));
 983             }
 984         }
 985
 986         // Check that the break we did find was expected
 987         if (t->getExpectedBreak(bp) == 0) {
 988             int expected[] = {0, bp};
 989             printStringBreaks(t->textToBreak, expected, 2);
 990             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 991                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
 992         } else {
 993             // The break was expected.
 994             //   Check that the {nnn} tag value is correct.
 995             int32_t expectedTagVal = t->getExpectedBreak(bp);
 996             if (expectedTagVal == -1) {
 997                 expectedTagVal = 0;
 998             }
 999             int32_t line = t->getSrcLine(bp);
1000             int32_t rs = t->bi->getRuleStatus();
1001             if (rs != expectedTagVal) {
1002                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1003                       "          Actual, Expected status = %4d, %4d",
1004                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1005             }
1006         }
1007
1008         prevBP = bp;
1009     }
1010
1011     // Verify that there were no missed expected breaks after the last one found
1012     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1013         if (t->getExpectedBreak(i) != 0) {
1014             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1015                       i, t->getSrcLine(i), t->getSrcCol(i));
1016         }
1017     }
1018
1019     //
1020     //  Run the iterator backwards, verify that the same breaks are found.
1021     //
1022     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1023     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1024         if (prevBP ==  bp) {
1025             // Fail for lack of progress.
1026             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1027                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1028             break;
1029         }
1030
1031         // Check that we didn't miss an expected break between the last one
1032         //  and this one.  (UVector returns zeros for index out of bounds.)
1033         for (i=prevBP-1; i>bp; i--) {
1034             if (t->getExpectedBreak(i) != 0) {
1035                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1036                       i, t->getSrcLine(i), t->getSrcCol(i));
1037             }
1038         }
1039
1040         // Check that the break we did find was expected
1041         if (t->getExpectedBreak(bp) == 0) {
1042             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1043                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1044         } else {
1045             // The break was expected.
1046             //   Check that the {nnn} tag value is correct.
1047             int32_t expectedTagVal = t->getExpectedBreak(bp);
1048             if (expectedTagVal == -1) {
1049                 expectedTagVal = 0;
1050             }
1051             int line = t->getSrcLine(bp);
1052             int32_t rs = t->bi->getRuleStatus();
1053             if (rs != expectedTagVal) {
1054                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1055                       "          Actual, Expected status = %4d, %4d",
1056                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1057             }
1058         }
1059
1060         prevBP = bp;
1061     }
1062
1063     // Verify that there were no missed breaks prior to the last one found
1064     for (i=prevBP-1; i>=0; i--) {
1065         if (t->getExpectedBreak(i) != 0) {
1066             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1067                       i, t->getSrcLine(i), t->getSrcCol(i));
1068         }
1069     }
1070
1071     // Check isBoundary()
1072     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1073         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1074         UBool boundaryFound    = t->bi->isBoundary(i);
1075         if (boundaryExpected != boundaryFound) {
1076             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1077                   "        Expected, Actual= %s, %s",
1078                   i, t->getSrcLine(i), t->getSrcCol(i),
1079                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1080         }
1081     }
1082
1083     // Check following()
1084     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1085         int32_t actualBreak = t->bi->following(i);
1086         int32_t expectedBreak = BreakIterator::DONE;
1087         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1088             if (t->getExpectedBreak(j) != 0) {
1089                 expectedBreak = j;
1090                 break;
1091             }
1092         }
1093         if (expectedBreak != actualBreak) {
1094             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1095                   "        Expected, Actual= %d, %d",
1096                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1097         }
1098     }
1099
1100     // Check preceding()
1101     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1102         int32_t actualBreak = t->bi->preceding(i);
1103         int32_t expectedBreak = BreakIterator::DONE;
1104
1105         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1106         // preceding(trailing byte) will return the index of some preceding code point,
1107         // not the lead byte of the current code point, even though that has a smaller index.
1108         // Therefore, start looking at the expected break data not at i-1, but at
1109         // the start of code point index - 1.
1110         utext_setNativeIndex(t->textToBreak, i);
1111         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1112         for (; j >= 0; j--) {
1113             if (t->getExpectedBreak(j) != 0) {
1114                 expectedBreak = j;
1115                 break;
1116             }
1117         }
1118         if (expectedBreak != actualBreak) {
1119             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1120                   "        Expected, Actual= %d, %d",
1121                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1122         }
1123     }
1124 }
1125
1126
1127 void RBBITest::TestExtended() {
1128   // Skip test for now when UCONFIG_NO_FILTERED_BREAK_ITERATION is set. This
1129   // data driven test closely entangles filtered and regular data.
1130 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILTERED_BREAK_ITERATION
1131     UErrorCode      status  = U_ZERO_ERROR;
1132     Locale          locale("");
1133
1134     UnicodeString       rules;
1135     TestParams          tp(status);
1136
1137     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1138     if (U_FAILURE(status)) {
1139         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1140     }
1141
1142
1143     //
1144     //  Open and read the test data file.
1145     //
1146     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1147     char testFileName[1000];
1148     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1149         errln("Can't open test data.  Path too long.");
1150         return;
1151     }
1152     strcpy(testFileName, testDataDirectory);
1153     strcat(testFileName, "rbbitst.txt");
1154
1155     int    len;
1156     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1157     if (U_FAILURE(status)) {
1158         return; /* something went wrong, error already output */
1159     }
1160
1161
1162     bool skipTest = false; // Skip this test?
1163
1164     //
1165     //  Put the test data into a UnicodeString
1166     //
1167     UnicodeString testString(FALSE, testFile, len);
1168
1169     enum EParseState{
1170         PARSE_COMMENT,
1171         PARSE_TAG,
1172         PARSE_DATA,
1173         PARSE_NUM
1174     }
1175     parseState = PARSE_TAG;
1176
1177     EParseState savedState = PARSE_TAG;
1178
1179     int32_t    lineNum  = 1;
1180     int32_t    colStart = 0;
1181     int32_t    column   = 0;
1182     int32_t    charIdx  = 0;
1183
1184     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1185
1186     for (charIdx = 0; charIdx < len; ) {
1187         status = U_ZERO_ERROR;
1188         UChar  c = testString.charAt(charIdx);
1189         charIdx++;
1190         if (c == u'\r' && charIdx<len && testString.charAt(charIdx) == u'\n') {
1191             // treat CRLF as a unit
1192             c = u'\n';
1193             charIdx++;
1194         }
1195         if (c == u'\n' || c == u'\r') {
1196             lineNum++;
1197             colStart = charIdx;
1198         }
1199         column = charIdx - colStart + 1;
1200
1201         switch (parseState) {
1202         case PARSE_COMMENT:
1203             if (c == u'\n' || c == u'\r') {
1204                 parseState = savedState;
1205             }
1206             break;
1207
1208         case PARSE_TAG:
1209             {
1210             if (c == u'#') {
1211                 parseState = PARSE_COMMENT;
1212                 savedState = PARSE_TAG;
1213                 break;
1214             }
1215             if (u_isUWhiteSpace(c)) {
1216                 break;
1217             }
1218             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1219                 delete tp.bi;
1220                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1221                 skipTest = false;
1222                 charIdx += 5;
1223                 break;
1224             }
1225             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1226                 delete tp.bi;
1227                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1228                 skipTest = false;
1229                 charIdx += 5;
1230                 break;
1231             }
1232             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1233                 delete tp.bi;
1234                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1235                 skipTest = false;
1236                 charIdx += 5;
1237                 break;
1238             }
1239             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1240                 delete tp.bi;
1241                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1242                 skipTest = false;
1243                 charIdx += 5;
1244                 break;
1245             }
1246             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1247                 delete tp.bi;
1248                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1249                 charIdx += 6;
1250                 break;
1251             }
1252
1253             // <locale  loc_name>
1254             localeMatcher.reset(testString);
1255             if (localeMatcher.lookingAt(charIdx-1, status)) {
1256                 UnicodeString localeName = localeMatcher.group(1, status);
1257                 char localeName8[100];
1258                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1259                 locale = Locale::createFromName(localeName8);
1260                 charIdx += localeMatcher.group(0, status).length() - 1;
1261                 TEST_ASSERT_SUCCESS(status);
1262                 break;
1263             }
1264             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1265                 parseState = PARSE_DATA;
1266                 charIdx += 5;
1267                 tp.dataToBreak = "";
1268                 tp.expectedBreaks->removeAllElements();
1269                 tp.srcCol ->removeAllElements();
1270                 tp.srcLine->removeAllElements();
1271                 break;
1272             }
1273
1274             errln("line %d: Tag expected in test file.", lineNum);
1275             parseState = PARSE_COMMENT;
1276             savedState = PARSE_DATA;
1277             goto end_test; // Stop the test.
1278             }
1279             break;
1280
1281         case PARSE_DATA:
1282             if (c == u'\u2022') { // u'•'
1283                 int32_t  breakIdx = tp.dataToBreak.length();
1284                 tp.expectedBreaks->setSize(breakIdx+1);
1285                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1286                 tp.srcLine->setSize(breakIdx+1);
1287                 tp.srcLine->setElementAt(lineNum, breakIdx);
1288                 tp.srcCol ->setSize(breakIdx+1);
1289                 tp.srcCol ->setElementAt(column, breakIdx);
1290                 break;
1291             }
1292
1293             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1294                 // Add final entry to mappings from break location to source file position.
1295                 //  Need one extra because last break position returned is after the
1296                 //    last char in the data, not at the last char.
1297                 tp.srcLine->addElement(lineNum, status);
1298                 tp.srcCol ->addElement(column, status);
1299
1300                 parseState = PARSE_TAG;
1301                 charIdx += 6;
1302
1303                 if (!skipTest) {
1304                     // RUN THE TEST!
1305                     status = U_ZERO_ERROR;
1306                     tp.setUTF16(status);
1307                     executeTest(&tp, status);
1308                     TEST_ASSERT_SUCCESS(status);
1309
1310                     // Run again, this time with UTF-8 text wrapped in a UText.
1311                     status = U_ZERO_ERROR;
1312                     tp.setUTF8(status);
1313                     TEST_ASSERT_SUCCESS(status);
1314                     executeTest(&tp, status);
1315                 }
1316                 break;
1317             }
1318
1319             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1320                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1321                 // Get the code point from the name and insert it into the test data.
1322                 //   (Damn, no API takes names in Unicode  !!!
1323                 //    we've got to take it back to char *)
1324                 int32_t nameEndIdx = testString.indexOf(u'}', charIdx);
1325                 int32_t nameLength = nameEndIdx - (charIdx+2);
1326                 char charNameBuf[200];
1327                 UChar32 theChar = -1;
1328                 if (nameEndIdx != -1) {
1329                     UErrorCode status = U_ZERO_ERROR;
1330                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1331                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1332                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1333                     if (U_FAILURE(status)) {
1334                         theChar = -1;
1335                     }
1336                 }
1337                 if (theChar == -1) {
1338                     errln("Error in named character in test file at line %d, col %d",
1339                         lineNum, column);
1340                 } else {
1341                     // Named code point was recognized.  Insert it
1342                     //   into the test data.
1343                     tp.dataToBreak.append(theChar);
1344                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1345                         tp.srcLine->addElement(lineNum, status);
1346                         tp.srcCol ->addElement(column, status);
1347                     }
1348                 }
1349                 if (nameEndIdx > charIdx) {
1350                     charIdx = nameEndIdx+1;
1351
1352                 }
1353                 break;
1354             }
1355
1356
1357
1358
1359             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1360                 charIdx++;
1361                 int32_t  breakIdx = tp.dataToBreak.length();
1362                 tp.expectedBreaks->setSize(breakIdx+1);
1363                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1364                 tp.srcLine->setSize(breakIdx+1);
1365                 tp.srcLine->setElementAt(lineNum, breakIdx);
1366                 tp.srcCol ->setSize(breakIdx+1);
1367                 tp.srcCol ->setElementAt(column, breakIdx);
1368                 break;
1369             }
1370
1371             if (c == u'<') {
1372                 tagValue   = 0;
1373                 parseState = PARSE_NUM;
1374                 break;
1375             }
1376
1377             if (c == u'#' && column==3) {   // TODO:  why is column off so far?
1378                 parseState = PARSE_COMMENT;
1379                 savedState = PARSE_DATA;
1380                 break;
1381             }
1382
1383             if (c == u'\\') {
1384                 // Check for \ at end of line, a line continuation.
1385                 //     Advance over (discard) the newline
1386                 UChar32 cp = testString.char32At(charIdx);
1387                 if (cp == u'\r' && charIdx<len && testString.charAt(charIdx+1) == u'\n') {
1388                     // We have a CR LF
1389                     //  Need an extra increment of the input ptr to move over both of them
1390                     charIdx++;
1391                 }
1392                 if (cp == u'\n' || cp == u'\r') {
1393                     lineNum++;
1394                     colStart = charIdx;
1395                     charIdx++;
1396                     break;
1397                 }
1398
1399                 // Let unescape handle the back slash.
1400                 cp = testString.unescapeAt(charIdx);
1401                 if (cp != -1) {
1402                     // Escape sequence was recognized.  Insert the char
1403                     //   into the test data.
1404                     tp.dataToBreak.append(cp);
1405                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1406                         tp.srcLine->addElement(lineNum, status);
1407                         tp.srcCol ->addElement(column, status);
1408                     }
1409                     break;
1410                 }
1411
1412
1413                 // Not a recognized backslash escape sequence.
1414                 // Take the next char as a literal.
1415                 //  TODO:  Should this be an error?
1416                 c = testString.charAt(charIdx);
1417                 charIdx = testString.moveIndex32(charIdx, 1);
1418             }
1419
1420             // Normal, non-escaped data char.
1421             tp.dataToBreak.append(c);
1422
1423             // Save the mapping from offset in the data to line/column numbers in
1424             //   the original input file.  Will be used for better error messages only.
1425             //   If there's an expected break before this char, the slot in the mapping
1426             //     vector will already be set for this char; don't overwrite it.
1427             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1428                 tp.srcLine->addElement(lineNum, status);
1429                 tp.srcCol ->addElement(column, status);
1430             }
1431             break;
1432
1433
1434         case PARSE_NUM:
1435             // We are parsing an expected numeric tag value, like <1234>,
1436             //   within a chunk of data.
1437             if (u_isUWhiteSpace(c)) {
1438                 break;
1439             }
1440
1441             if (c == u'>') {
1442                 // Finished the number.  Add the info to the expected break data,
1443                 //   and switch parse state back to doing plain data.
1444                 parseState = PARSE_DATA;
1445                 if (tagValue == 0) {
1446                     tagValue = -1;
1447                 }
1448                 int32_t  breakIdx = tp.dataToBreak.length();
1449                 tp.expectedBreaks->setSize(breakIdx+1);
1450                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1451                 tp.srcLine->setSize(breakIdx+1);
1452                 tp.srcLine->setElementAt(lineNum, breakIdx);
1453                 tp.srcCol ->setSize(breakIdx+1);
1454                 tp.srcCol ->setElementAt(column, breakIdx);
1455                 break;
1456             }
1457
1458             if (u_isdigit(c)) {
1459                 tagValue = tagValue*10 + u_charDigitValue(c);
1460                 break;
1461             }
1462
1463             errln("Syntax Error in test file at line %d, col %d",
1464                 lineNum, column);
1465             parseState = PARSE_COMMENT;
1466             goto end_test; // Stop the test
1467             break;
1468         }
1469
1470
1471         if (U_FAILURE(status)) {
1472             dataerrln("ICU Error %s while parsing test file at line %d.",
1473                 u_errorName(status), lineNum);
1474             status = U_ZERO_ERROR;
1475             goto end_test; // Stop the test
1476         }
1477
1478     }
1479
1480 end_test:
1481     delete [] testFile;
1482 #endif
1483 }
1484
1485
1486 //-------------------------------------------------------------------------------
1487 //
1488 //  TestDictRules   create a break iterator from source rules that includes a
1489 //                  dictionary range.   Regression for bug #7130.  Source rules
1490 //                  do not declare a break iterator type (word, line, sentence, etc.
1491 //                  but the dictionary code, without a type, would loop.
1492 //
1493 //-------------------------------------------------------------------------------
1494 void RBBITest::TestDictRules() {
1495     const char *rules =  "$dictionary = [a-z]; \n"
1496                          "!!forward; \n"
1497                          "$dictionary $dictionary; \n"
1498                          "!!reverse; \n"
1499                          "$dictionary $dictionary; \n";
1500     const char *text = "aa";
1501     UErrorCode status = U_ZERO_ERROR;
1502     UParseError parseError;
1503
1504     RuleBasedBreakIterator bi(rules, parseError, status);
1505     if (U_SUCCESS(status)) {
1506         UnicodeString utext = text;
1507         bi.setText(utext);
1508         int32_t position;
1509         int32_t loops;
1510         for (loops = 0; loops<10; loops++) {
1511             position = bi.next();
1512             if (position == RuleBasedBreakIterator::DONE) {
1513                 break;
1514             }
1515         }
1516         TEST_ASSERT(loops == 1);
1517     } else {
1518         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1519     }
1520 }
1521
1522
1523
1524 //-------------------------------------------------------------------------------
1525 //
1526 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1527 //    return the data in one big UChar * buffer, which the caller must delete.
1528 //
1529 //    parameters:
1530 //          fileName:   the name of the file, with no directory part.  The test data directory
1531 //                      is assumed.
1532 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1533 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1534 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1535 //                      Pass NULL for the system default encoding.
1536 //          status
1537 //    returns:
1538 //                      The file data, converted to UChar.
1539 //                      The caller must delete this when done with
1540 //                           delete [] theBuffer;
1541 //
1542 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1543 //           Move this function to some common place.
1544 //
1545 //--------------------------------------------------------------------------------
1546 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1547     UChar       *retPtr  = NULL;
1548     char        *fileBuf = NULL;
1549     UConverter* conv     = NULL;
1550     FILE        *f       = NULL;
1551
1552     ulen = 0;
1553     if (U_FAILURE(status)) {
1554         return retPtr;
1555     }
1556
1557     //
1558     //  Open the file.
1559     //
1560     f = fopen(fileName, "rb");
1561     if (f == 0) {
1562         dataerrln("Error opening test data file %s\n", fileName);
1563         status = U_FILE_ACCESS_ERROR;
1564         return NULL;
1565     }
1566     //
1567     //  Read it in
1568     //
1569     int   fileSize;
1570     int   amt_read;
1571
1572     fseek( f, 0, SEEK_END);
1573     fileSize = ftell(f);
1574     fileBuf = new char[fileSize];
1575     fseek(f, 0, SEEK_SET);
1576     amt_read = fread(fileBuf, 1, fileSize, f);
1577     if (amt_read != fileSize || fileSize <= 0) {
1578         errln("Error reading test data file.");
1579         goto cleanUpAndReturn;
1580     }
1581
1582     //
1583     // Look for a Unicode Signature (BOM) on the data just read
1584     //
1585     int32_t        signatureLength;
1586     const char *   fileBufC;
1587     const char*    bomEncoding;
1588
1589     fileBufC = fileBuf;
1590     bomEncoding = ucnv_detectUnicodeSignature(
1591         fileBuf, fileSize, &signatureLength, &status);
1592     if(bomEncoding!=NULL ){
1593         fileBufC  += signatureLength;
1594         fileSize  -= signatureLength;
1595         encoding = bomEncoding;
1596     }
1597
1598     //
1599     // Open a converter to take the rule file to UTF-16
1600     //
1601     conv = ucnv_open(encoding, &status);
1602     if (U_FAILURE(status)) {
1603         goto cleanUpAndReturn;
1604     }
1605
1606     //
1607     // Convert the rules to UChar.
1608     //  Preflight first to determine required buffer size.
1609     //
1610     ulen = ucnv_toUChars(conv,
1611         NULL,           //  dest,
1612         0,              //  destCapacity,
1613         fileBufC,
1614         fileSize,
1615         &status);
1616     if (status == U_BUFFER_OVERFLOW_ERROR) {
1617         // Buffer Overflow is expected from the preflight operation.
1618         status = U_ZERO_ERROR;
1619
1620         retPtr = new UChar[ulen+1];
1621         ucnv_toUChars(conv,
1622             retPtr,       //  dest,
1623             ulen+1,
1624             fileBufC,
1625             fileSize,
1626             &status);
1627     }
1628
1629 cleanUpAndReturn:
1630     fclose(f);
1631     delete []fileBuf;
1632     ucnv_close(conv);
1633     if (U_FAILURE(status)) {
1634         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1635         delete []retPtr;
1636         retPtr = 0;
1637         ulen   = 0;
1638     };
1639     return retPtr;
1640 }
1641
1642
1643
1644 //--------------------------------------------------------------------------------------------
1645 //
1646 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1647 //
1648 //-------------------------------------------------------------------------------------------
1649 void RBBITest::TestUnicodeFiles() {
1650     RuleBasedBreakIterator  *bi;
1651     UErrorCode               status = U_ZERO_ERROR;
1652
1653     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1654     TEST_ASSERT_SUCCESS(status);
1655     if (U_SUCCESS(status)) {
1656         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1657     }
1658     delete bi;
1659
1660     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1661     TEST_ASSERT_SUCCESS(status);
1662     if (U_SUCCESS(status)) {
1663         runUnicodeTestData("WordBreakTest.txt", bi);
1664     }
1665     delete bi;
1666
1667     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1668     TEST_ASSERT_SUCCESS(status);
1669     if (U_SUCCESS(status)) {
1670         runUnicodeTestData("SentenceBreakTest.txt", bi);
1671     }
1672     delete bi;
1673
1674     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1675     TEST_ASSERT_SUCCESS(status);
1676     if (U_SUCCESS(status)) {
1677         runUnicodeTestData("LineBreakTest.txt", bi);
1678     }
1679     delete bi;
1680 }
1681
1682
1683 // Check for test cases from the Unicode test data files that are known to fail
1684 // and should be skipped because ICU is not yet able to fully implement the spec.
1685 // See ticket #7270.
1686
1687 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1688     static struct TestCase {
1689         const char *fFileName;
1690         const UChar *fString;
1691     } badTestCases[] = {                                // Line Numbers from Unicode 7.0.0 file.
1692         {"LineBreakTest.txt", u"\u200B\u0020}"},        // Line 5198
1693         {"LineBreakTest.txt", u"\u200B\u0020)"},        // Line 5202
1694         {"LineBreakTest.txt", u"\u200B\u0020!"},        // Line 5214
1695         {"LineBreakTest.txt", u"\u200B\u0020,"},        // Line 5246
1696         {"LineBreakTest.txt", u"\u200B\u0020/"},        // Line 5298
1697         {"LineBreakTest.txt", u"\u200B\u0020\u2060"},   // Line 5302
1698                                                         // Line Numbers from pre-release verion of GraphemeBreakTest-10.0.0.txt
1699         {"GraphemeBreakTest.txt", u"\u200D\u2640"},     // Line 656, old GB 11 test ZWJ x GAZ
1700         {"GraphemeBreakTest.txt", u"\u200D\U0001F466"}, // Line 658, old GB 11 test ZWJ x EBG
1701         {"GraphemeBreakTest.txt", u"\u200D\U0001F466\U0001F3FB"}, // Line 842, old GB 11 test ZWJ x EBG x EModifier
1702
1703                                                         // Line Numbers from pre-release verion of WordBreakTest-10.0.0.txt
1704         {"WordBreakTest.txt", u"\u200D\u261D"},         // Line 1356, ZWJ x EmojiNRK
1705         {"WordBreakTest.txt", u"\u200D\U0001F3FB"},     // Line 1358, ZWJ x EmojiNRK
1706     };
1707
1708     for (int n=0; n<UPRV_LENGTHOF(badTestCases); n++) {
1709         const TestCase &badCase = badTestCases[n];
1710         if (!strcmp(fileName, badCase.fFileName) &&
1711                 testCase == UnicodeString(badCase.fString)) {
1712             return logKnownIssue("7270");
1713         }
1714     }
1715     return FALSE;
1716 }
1717
1718
1719 //--------------------------------------------------------------------------------------------
1720 //
1721 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1722 //
1723 //-------------------------------------------------------------------------------------------
1724 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1725 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1726     UErrorCode  status = U_ZERO_ERROR;
1727
1728     //
1729     //  Open and read the test data file, put it into a UnicodeString.
1730     //
1731     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1732     char testFileName[1000];
1733     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1734         dataerrln("Can't open test data.  Path too long.");
1735         return;
1736     }
1737     strcpy(testFileName, testDataDirectory);
1738     strcat(testFileName, fileName);
1739
1740     logln("Opening data file %s\n", fileName);
1741
1742     int    len;
1743     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1744     if (status != U_FILE_ACCESS_ERROR) {
1745         TEST_ASSERT_SUCCESS(status);
1746         TEST_ASSERT(testFile != NULL);
1747     }
1748     if (U_FAILURE(status) || testFile == NULL) {
1749         return; /* something went wrong, error already output */
1750     }
1751     UnicodeString testFileAsString(TRUE, testFile, len);
1752
1753     //
1754     //  Parse the test data file using a regular expression.
1755     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1756     //     is identified by which group had a match.
1757     //
1758     //    Caputure Group #                  1          2            3            4           5
1759     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1760     //
1761     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1762     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1763     UnicodeString   testString;
1764     UVector32       breakPositions(status);
1765     int             lineNumber = 1;
1766     TEST_ASSERT_SUCCESS(status);
1767     if (U_FAILURE(status)) {
1768         return;
1769     }
1770
1771     //
1772     //  Scan through each test case, building up the string to be broken in testString,
1773     //   and the positions that should be boundaries in the breakPositions vector.
1774     //
1775     int spin = 0;
1776     while (tokenMatcher.find()) {
1777         if(tokenMatcher.hitEnd()) {
1778           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1779              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1780              and caused an infinite loop here on EBCDIC systems!
1781           */
1782           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1783           //       return;
1784         }
1785         if (tokenMatcher.start(1, status) >= 0) {
1786             // Scanned a divide sign, indicating a break position in the test data.
1787             if (testString.length()>0) {
1788                 breakPositions.addElement(testString.length(), status);
1789             }
1790         }
1791         else if (tokenMatcher.start(2, status) >= 0) {
1792             // Scanned an 'x', meaning no break at this position in the test data
1793             //   Nothing to be done here.
1794             }
1795         else if (tokenMatcher.start(3, status) >= 0) {
1796             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1797             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1798             int length = hexNumber.length();
1799             if (length<=8) {
1800                 char buf[10];
1801                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1802                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1803                 if (c<=0x10ffff) {
1804                     testString.append(c);
1805                 } else {
1806                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1807                        fileName, lineNumber);
1808                 }
1809             } else {
1810                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1811                        fileName, lineNumber);
1812              }
1813         }
1814         else if (tokenMatcher.start(4, status) >= 0) {
1815             // Scanned to end of a line, possibly skipping over a comment in the process.
1816             //   If the line from the file contained test data, run the test now.
1817             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1818                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1819             }
1820
1821             // Clear out this test case.
1822             //    The string and breakPositions vector will be refilled as the next
1823             //       test case is parsed.
1824             testString.remove();
1825             breakPositions.removeAllElements();
1826             lineNumber++;
1827         } else {
1828             // Scanner catchall.  Something unrecognized appeared on the line.
1829             char token[16];
1830             UnicodeString uToken = tokenMatcher.group(0, status);
1831             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1832             token[sizeof(token)-1] = 0;
1833             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1834
1835             // Clean up, in preparation for continuing with the next line.
1836             testString.remove();
1837             breakPositions.removeAllElements();
1838             lineNumber++;
1839         }
1840         TEST_ASSERT_SUCCESS(status);
1841         if (U_FAILURE(status)) {
1842             break;
1843         }
1844     }
1845
1846     delete [] testFile;
1847  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1848 }
1849
1850 //--------------------------------------------------------------------------------------------
1851 //
1852 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1853 //                            test data files.  Do only a simple, forward-only check -
1854 //                            this test is mostly to check that ICU and the Unicode
1855 //                            data agree with each other.
1856 //
1857 //--------------------------------------------------------------------------------------------
1858 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1859                          const UnicodeString &testString,   // Text data to be broken
1860                          UVector32 *breakPositions,         // Positions where breaks should be found.
1861                          RuleBasedBreakIterator *bi) {
1862     int32_t pos;                 // Break Position in the test string
1863     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1864     int32_t expectedPos;         // Expected break position (index into test string)
1865
1866     bi->setText(testString);
1867     pos = bi->first();
1868     pos = bi->next();
1869
1870     while (pos != BreakIterator::DONE) {
1871         if (expectedI >= breakPositions->size()) {
1872             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1873                 testFileName, lineNumber, pos);
1874             break;
1875         }
1876         expectedPos = breakPositions->elementAti(expectedI);
1877         if (pos < expectedPos) {
1878             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1879                 testFileName, lineNumber, pos);
1880             break;
1881         }
1882         if (pos > expectedPos) {
1883             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1884                 testFileName, lineNumber, expectedPos);
1885             break;
1886         }
1887         pos = bi->next();
1888         expectedI++;
1889     }
1890
1891     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1892         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1893             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1894     }
1895 }
1896
1897
1898
1899 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1900 //---------------------------------------------------------------------------------------
1901 //
//   class RBBIMonkeyKind
1903 //
1904 //      Monkey Test for Break Iteration
1905 //      Abstract interface class.   Concrete derived classes independently
1906 //      implement the break rules for different iterator types.
1907 //
//      The Monkey Test itself doesn't know which type of break iterator it is
1909 //      testing, but works purely in terms of the interface defined here.
1910 //
1911 //---------------------------------------------------------------------------------------
1912 class RBBIMonkeyKind {
1913 public:
1914     // Return a UVector of UnicodeSets, representing the character classes used
1915     //   for this type of iterator.
1916     virtual  UVector  *charClasses() = 0;
1917
1918     // Set the test text on which subsequent calls to next() will operate
1919     virtual  void      setText(const UnicodeString &s) = 0;
1920
1921     // Find the next break postion, starting from the prev break position, or from zero.
1922     // Return -1 after reaching end of string.
1923     virtual  int32_t   next(int32_t i) = 0;
1924
1925     virtual ~RBBIMonkeyKind();
1926     UErrorCode       deferredStatus;
1927
1928
1929 protected:
1930     RBBIMonkeyKind();
1931
1932 private:
1933 };
1934
1935 RBBIMonkeyKind::RBBIMonkeyKind() {
1936     deferredStatus = U_ZERO_ERROR;
1937 }
1938
1939 RBBIMonkeyKind::~RBBIMonkeyKind() {
1940 }
1941
1942
1943 //----------------------------------------------------------------------------------------
1944 //
1945 //   Random Numbers.  Similar to standard lib rand() and srand()
1946 //                    Not using library to
1947 //                      1.  Get same results on all platforms.
1948 //                      2.  Get access to current seed, to more easily reproduce failures.
1949 //
1950 //---------------------------------------------------------------------------------------
// Current generator state.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    m_seed = m_seed * 1103515245 + 12345;
    // Bits 16..30 of the new state: the same value as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
1958
1959
1960 //
1961 // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r13267
1962 //
// UnicodeSet pattern text: a bracketed list of \Uhhhhhhhh code points and
// code point ranges.  The backslashes are doubled so that they reach the
// UnicodeSet pattern parser as escapes instead of being expanded by the compiler.
static const char16_t * const gExtended_Pict =
    u"["
    "\\U0001F774-\\U0001F77F\\U00002700-\\U00002701\\U00002703-\\U00002704\\U0000270E\\U00002710-\\U00002711\\U00002765-\\U00002767"
    "\\U0001F030-\\U0001F093\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5"
    "\\U0001F260-\\U0001F265\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F25F"
    "\\U0001F266-\\U0001F2FF\\U0001F7D5-\\U0001F7FF\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F"
    "\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6"
    "\\U0001F4FE\\U0001F53E-\\U0001F548\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586"
    "\\U0001F588-\\U0001F589\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7"
    "\\U0001F5A9-\\U0001F5B0\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB"
    "\\U0001F5DF-\\U0001F5E0\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9"
    "\\U00002605\\U00002607-\\U0000260D\\U0000260F-\\U00002610\\U00002612\\U00002616-\\U00002617\\U00002619-\\U0000261C"
    "\\U0000261E-\\U0000261F\\U00002621\\U00002624-\\U00002625\\U00002627-\\U00002629\\U0000262B-\\U0000262D\\U00002630-\\U00002637"
    "\\U0000263B-\\U00002647\\U00002654-\\U0000265F\\U00002661-\\U00002662\\U00002664\\U00002667\\U00002669-\\U0000267A"
    "\\U0000267C-\\U0000267E\\U00002680-\\U00002691\\U00002695\\U00002698\\U0000269A\\U0000269D-\\U0000269F\\U000026A2-\\U000026A9"
    "\\U000026AC-\\U000026AF\\U000026B2-\\U000026BC\\U000026BF-\\U000026C3\\U000026C6-\\U000026C7\\U000026C9-\\U000026CD"
    "\\U000026D0\\U000026D2\\U000026D5-\\U000026E8\\U000026EB-\\U000026EF\\U000026F6\\U000026FB-\\U000026FC\\U000026FE-\\U000026FF"
    "\\U00002388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5"
    "\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F"
    "\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF\\U0001F900-\\U0001F90B\\U0001F91F\\U0001F928-\\U0001F92F"
    "\\U0001F931-\\U0001F932\\U0001F94C\\U0001F95F-\\U0001F96B\\U0001F992-\\U0001F997\\U0001F9D0-\\U0001F9E6\\U0001F90C-\\U0001F90F"
    "\\U0001F93F\\U0001F94D-\\U0001F94F\\U0001F96C-\\U0001F97F\\U0001F998-\\U0001F9BF\\U0001F9C1-\\U0001F9CF\\U0001F9E7-\\U0001F9FF"
    "\\U0001F6C6-\\U0001F6CA\\U0001F6D3-\\U0001F6D4\\U0001F6E6-\\U0001F6E8\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6F7-\\U0001F6F8"
    "\\U0001F6D5-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F9-\\U0001F6FF"
    "]";
1987
1988 //------------------------------------------------------------------------------------------
1989 //
1990 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1991 //                             of RBBIMonkeyKind.
1992 //
1993 //------------------------------------------------------------------------------------------
1994 class RBBICharMonkey: public RBBIMonkeyKind {
1995 public:
1996     RBBICharMonkey();
1997     virtual          ~RBBICharMonkey();
1998     virtual  UVector *charClasses();
1999     virtual  void     setText(const UnicodeString &s);
2000     virtual  int32_t  next(int32_t i);
2001 private:
2002     UVector   *fSets;
2003
2004     UnicodeSet  *fCRLFSet;
2005     UnicodeSet  *fControlSet;
2006     UnicodeSet  *fExtendSet;
2007     UnicodeSet  *fZWJSet;
2008     UnicodeSet  *fRegionalIndicatorSet;
2009     UnicodeSet  *fPrependSet;
2010     UnicodeSet  *fSpacingSet;
2011     UnicodeSet  *fLSet;
2012     UnicodeSet  *fVSet;
2013     UnicodeSet  *fTSet;
2014     UnicodeSet  *fLVSet;
2015     UnicodeSet  *fLVTSet;
2016     UnicodeSet  *fHangulSet;
2017     UnicodeSet  *fEmojiBaseSet;
2018     UnicodeSet  *fEmojiModifierSet;
2019     UnicodeSet  *fExtendedPictSet;
2020     UnicodeSet  *fEBGSet;
2021     UnicodeSet  *fEmojiNRKSet;
2022     UnicodeSet  *fAnySet;
2023
2024     const UnicodeString *fText;
2025 };
2026
2027
2028 RBBICharMonkey::RBBICharMonkey() {
2029     UErrorCode  status = U_ZERO_ERROR;
2030
2031     fText = NULL;
2032
2033     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2034     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]]"), status);
2035     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}]]"), status);
2036     fZWJSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = ZWJ}]"), status);
2037     fRegionalIndicatorSet =
2038                   new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2039     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2040     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2041     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2042     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2043     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2044     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2045     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2046     fHangulSet  = new UnicodeSet();
2047     fHangulSet->addAll(*fLSet);
2048     fHangulSet->addAll(*fVSet);
2049     fHangulSet->addAll(*fTSet);
2050     fHangulSet->addAll(*fLVSet);
2051     fHangulSet->addAll(*fLVTSet);
2052
2053     fEmojiBaseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
2054     fEmojiModifierSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EM}]"), status);
2055     fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
2056     fEBGSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = EBG}]"), status);
2057     fEmojiNRKSet      = new UnicodeSet(UNICODE_STRING_SIMPLE(
2058                 "[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
2059     fAnySet           = new UnicodeSet(0, 0x10ffff);
2060
2061     fSets             = new UVector(status);
2062     fSets->addElement(fCRLFSet,    status);
2063     fSets->addElement(fControlSet, status);
2064     fSets->addElement(fExtendSet,  status);
2065     fSets->addElement(fRegionalIndicatorSet, status);
2066     if (!fPrependSet->isEmpty()) {
2067         fSets->addElement(fPrependSet, status);
2068     }
2069     fSets->addElement(fSpacingSet, status);
2070     fSets->addElement(fHangulSet,  status);
2071     fSets->addElement(fAnySet,     status);
2072     fSets->addElement(fEmojiBaseSet, status);
2073     fSets->addElement(fEmojiModifierSet, status);
2074     fSets->addElement(fZWJSet,     status);
2075     fSets->addElement(fExtendedPictSet, status);
2076     fSets->addElement(fEBGSet,     status);
2077     fSets->addElement(fEmojiNRKSet,status);
2078     if (U_FAILURE(status)) {
2079         deferredStatus = status;
2080     }
2081 }
2082
2083
2084 void RBBICharMonkey::setText(const UnicodeString &s) {
2085     fText = &s;
2086 }
2087
2088
2089
2090 int32_t RBBICharMonkey::next(int32_t prevPos) {
2091     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2092                               //   break position being tested.  The candidate break
2093                               //   location is before p2.
2094
2095     int     breakPos = -1;
2096
2097     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2098     UChar32 cBase;            // for (X Extend*) patterns, the X character.
2099
2100     if (U_FAILURE(deferredStatus)) {
2101         return -1;
2102     }
2103
2104     // Previous break at end of string.  return DONE.
2105     if (prevPos >= fText->length()) {
2106         return -1;
2107     }
2108     p0 = p1 = p2 = p3 = prevPos;
2109     c3 =  fText->char32At(prevPos);
2110     c0 = c1 = c2 = cBase = 0;
2111     (void)p0;   // suppress set but not used warning.
2112     (void)c0;
2113
2114     // Loop runs once per "significant" character position in the input text.
2115     for (;;) {
2116         // Move all of the positions forward in the input string.
2117         p0 = p1;  c0 = c1;
2118         p1 = p2;  c1 = c2;
2119         p2 = p3;  c2 = c3;
2120
2121         // Advancd p3 by one codepoint
2122         p3 = fText->moveIndex32(p3, 1);
2123         c3 = fText->char32At(p3);
2124
2125         if (p1 == p2) {
2126             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2127             continue;
2128         }
2129         if (p2 == fText->length()) {
2130             // Reached end of string.  Always a break position.
2131             break;
2132         }
2133
2134         // Rule  GB3   CR x LF
2135         //     No Extend or Format characters may appear between the CR and LF,
2136         //     which requires the additional check for p2 immediately following p1.
2137         //
2138         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2139             continue;
2140         }
2141
2142         // Rule (GB4).   ( Control | CR | LF ) <break>
2143         if (fControlSet->contains(c1) ||
2144             c1 == 0x0D ||
2145             c1 == 0x0A)  {
2146             break;
2147         }
2148
2149         // Rule (GB5)    <break>  ( Control | CR | LF )
2150         //
2151         if (fControlSet->contains(c2) ||
2152             c2 == 0x0D ||
2153             c2 == 0x0A)  {
2154             break;
2155         }
2156
2157
2158         // Rule (GB6)  L x ( L | V | LV | LVT )
2159         if (fLSet->contains(c1) &&
2160                (fLSet->contains(c2)  ||
2161                 fVSet->contains(c2)  ||
2162                 fLVSet->contains(c2) ||
2163                 fLVTSet->contains(c2))) {
2164             continue;
2165         }
2166
2167         // Rule (GB7)    ( LV | V )  x  ( V | T )
2168         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2169             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2170             continue;
2171         }
2172
2173         // Rule (GB8)    ( LVT | T)  x T
2174         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2175             fTSet->contains(c2))  {
2176             continue;
2177         }
2178
2179         // Rule (GB9)    x (Extend | ZWJ)
2180         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
2181             if (!fExtendSet->contains(c1)) {
2182                 cBase = c1;
2183             }
2184             continue;
2185         }
2186
2187         // Rule (GB9a)   x  SpacingMark
2188         if (fSpacingSet->contains(c2)) {
2189             continue;
2190         }
2191
2192         // Rule (GB9b)   Prepend x
2193         if (fPrependSet->contains(c1)) {
2194             continue;
2195         }
2196
2197         // Rule (GB10)   (Emoji_Base | EBG) Extend * x Emoji_Modifier
2198         if ((fEmojiBaseSet->contains(c1) || fEBGSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2199             continue;
2200         }
2201         if ((fEmojiBaseSet->contains(cBase) || fEBGSet->contains(cBase)) &&
2202                 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2203             continue;
2204         }
2205
2206         // Rule (GB11)   (Glue_After_ZWJ | Emoji) Extend * ZWJ x (Glue_After_ZWJ | Emoji)
2207         if ((fExtendedPictSet->contains(c0) || fEmojiNRKSet->contains(c0)) && fZWJSet->contains(c1) &&
2208                 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2209             continue;
2210         }
2211         if ((fExtendedPictSet->contains(cBase) || fEmojiNRKSet->contains(cBase)) && fExtendSet->contains(c0) && fZWJSet->contains(c1) &&
2212                 (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2213             continue;
2214         }
2215
2216         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
2217         //                   Note: The first if condition is a little tricky. We only need to force
2218         //                      a break if there are three or more contiguous RIs. If there are
2219         //                      only two, a break following will occur via other rules, and will include
2220         //                      any trailing extend characters, which is needed behavior.
2221         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2222                 && fRegionalIndicatorSet->contains(c2)) {
2223             break;
2224         }
2225         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2226             continue;
2227         }
2228
2229         // Rule (GB999)  Any  <break>  Any
2230         break;
2231     }
2232
2233     breakPos = p2;
2234     return breakPos;
2235 }
2236
2237
2238
2239 UVector  *RBBICharMonkey::charClasses() {
2240     return fSets;
2241 }
2242
2243
2244 RBBICharMonkey::~RBBICharMonkey() {
2245     delete fSets;
2246     delete fCRLFSet;
2247     delete fControlSet;
2248     delete fExtendSet;
2249     delete fRegionalIndicatorSet;
2250     delete fPrependSet;
2251     delete fSpacingSet;
2252     delete fLSet;
2253     delete fVSet;
2254     delete fTSet;
2255     delete fLVSet;
2256     delete fLVTSet;
2257     delete fHangulSet;
2258     delete fAnySet;
2259     delete fEmojiBaseSet;
2260     delete fEmojiModifierSet;
2261     delete fZWJSet;
2262     delete fExtendedPictSet;
2263     delete fEBGSet;
2264     delete fEmojiNRKSet;
2265 }
2266
2267 //------------------------------------------------------------------------------------------
2268 //
2269 //   class RBBIWordMonkey      Word Break specific implementation
2270 //                             of RBBIMonkeyKind.
2271 //
2272 //------------------------------------------------------------------------------------------
2273 class RBBIWordMonkey: public RBBIMonkeyKind {
2274 public:
2275     RBBIWordMonkey();
2276     virtual          ~RBBIWordMonkey();
2277     virtual  UVector *charClasses();
2278     virtual  void     setText(const UnicodeString &s);
2279     virtual int32_t   next(int32_t i);
2280 private:
2281     UVector      *fSets;
2282
2283     UnicodeSet  *fCRSet;
2284     UnicodeSet  *fLFSet;
2285     UnicodeSet  *fNewlineSet;
2286     UnicodeSet  *fRegionalIndicatorSet;
2287     UnicodeSet  *fKatakanaSet;
2288     UnicodeSet  *fHebrew_LetterSet;
2289     UnicodeSet  *fALetterSet;
2290     UnicodeSet  *fSingle_QuoteSet;
2291     UnicodeSet  *fDouble_QuoteSet;
2292     UnicodeSet  *fMidNumLetSet;
2293     UnicodeSet  *fMidLetterSet;
2294     UnicodeSet  *fMidNumSet;
2295     UnicodeSet  *fNumericSet;
2296     UnicodeSet  *fFormatSet;
2297     UnicodeSet  *fOtherSet;
2298     UnicodeSet  *fExtendSet;
2299     UnicodeSet  *fExtendNumLetSet;
2300     UnicodeSet  *fDictionarySet;
2301     UnicodeSet  *fEBaseSet;
2302     UnicodeSet  *fEBGSet;
2303     UnicodeSet  *fEModifierSet;
2304     UnicodeSet  *fZWJSet;
2305     UnicodeSet  *fExtendedPictSet;
2306     UnicodeSet  *fEmojiNRKSet;
2307
2308     const UnicodeString  *fText;
2309 };
2310
2311
2312 RBBIWordMonkey::RBBIWordMonkey()
2313 {
2314     UErrorCode  status = U_ZERO_ERROR;
2315
2316     fSets            = new UVector(status);
2317
2318     fCRSet            = new UnicodeSet(u"[\\p{Word_Break = CR}]",           status);
2319     fLFSet            = new UnicodeSet(u"[\\p{Word_Break = LF}]",           status);
2320     fNewlineSet       = new UnicodeSet(u"[\\p{Word_Break = Newline}]",      status);
2321     fKatakanaSet      = new UnicodeSet(u"[\\p{Word_Break = Katakana}]",     status);
2322     fRegionalIndicatorSet =  new UnicodeSet(u"[\\p{Word_Break = Regional_Indicator}]", status);
2323     fHebrew_LetterSet = new UnicodeSet(u"[\\p{Word_Break = Hebrew_Letter}]", status);
2324     fALetterSet       = new UnicodeSet(u"[\\p{Word_Break = ALetter}]", status);
2325     fSingle_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Single_Quote}]",    status);
2326     fDouble_QuoteSet  = new UnicodeSet(u"[\\p{Word_Break = Double_Quote}]",    status);
2327     fMidNumLetSet     = new UnicodeSet(u"[\\p{Word_Break = MidNumLet}]",    status);
2328     fMidLetterSet     = new UnicodeSet(u"[\\p{Word_Break = MidLetter} - [\\:]]",    status);
2329     fMidNumSet        = new UnicodeSet(u"[\\p{Word_Break = MidNum}]",       status);
2330     fNumericSet       = new UnicodeSet(u"[\\p{Word_Break = Numeric}]",      status);
2331     fFormatSet        = new UnicodeSet(u"[\\p{Word_Break = Format}]",       status);
2332     fExtendNumLetSet  = new UnicodeSet(u"[\\p{Word_Break = ExtendNumLet}]", status);
2333     fExtendSet        = new UnicodeSet(u"[\\p{Word_Break = Extend}]",       status);
2334
2335     fEBaseSet         = new UnicodeSet(u"[[\\p{Word_Break = EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]", status);
2336     fEBGSet           = new UnicodeSet(u"[\\p{Word_Break = EBG}]",          status);
2337     fEModifierSet     = new UnicodeSet(u"[\\p{Word_Break = EM}]",           status);
2338     fZWJSet           = new UnicodeSet(u"[\\p{Word_Break = ZWJ}]",          status);
2339     fExtendedPictSet  = new UnicodeSet(gExtended_Pict, status);
2340     fEmojiNRKSet      = new UnicodeSet(
2341             u"[[\\p{Emoji}]-[\\p{Word_Break = Regional_Indicator}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]", status);
2342
2343     fDictionarySet = new UnicodeSet(u"[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]", status);
2344     fDictionarySet->addAll(*fKatakanaSet);
2345     fDictionarySet->addAll(UnicodeSet(u"[\\p{LineBreak = Complex_Context}]", status));
2346
2347     fALetterSet->removeAll(*fDictionarySet);
2348
2349     fOtherSet        = new UnicodeSet();
2350     if(U_FAILURE(status)) {
2351         IntlTest::gTest->errln("%s:%d %s", __FILE__, __LINE__, u_errorName(status));
2352         deferredStatus = status;
2353         return;
2354     }
2355
2356     fOtherSet->complement();
2357     fOtherSet->removeAll(*fCRSet);
2358     fOtherSet->removeAll(*fLFSet);
2359     fOtherSet->removeAll(*fNewlineSet);
2360     fOtherSet->removeAll(*fKatakanaSet);
2361     fOtherSet->removeAll(*fHebrew_LetterSet);
2362     fOtherSet->removeAll(*fALetterSet);
2363     fOtherSet->removeAll(*fSingle_QuoteSet);
2364     fOtherSet->removeAll(*fDouble_QuoteSet);
2365     fOtherSet->removeAll(*fMidLetterSet);
2366     fOtherSet->removeAll(*fMidNumSet);
2367     fOtherSet->removeAll(*fNumericSet);
2368     fOtherSet->removeAll(*fExtendNumLetSet);
2369     fOtherSet->removeAll(*fFormatSet);
2370     fOtherSet->removeAll(*fExtendSet);
2371     fOtherSet->removeAll(*fRegionalIndicatorSet);
2372     fOtherSet->removeAll(*fEBaseSet);
2373     fOtherSet->removeAll(*fEBGSet);
2374     fOtherSet->removeAll(*fEModifierSet);
2375     fOtherSet->removeAll(*fZWJSet);
2376     fOtherSet->removeAll(*fExtendedPictSet);
2377     fOtherSet->removeAll(*fEmojiNRKSet);
2378
2379     // Inhibit dictionary characters from being tested at all.
2380     fOtherSet->removeAll(*fDictionarySet);
2381
2382     fSets->addElement(fCRSet,                status);
2383     fSets->addElement(fLFSet,                status);
2384     fSets->addElement(fNewlineSet,           status);
2385     fSets->addElement(fRegionalIndicatorSet, status);
2386     fSets->addElement(fHebrew_LetterSet,     status);
2387     fSets->addElement(fALetterSet,           status);
2388     fSets->addElement(fSingle_QuoteSet,      status);
2389     fSets->addElement(fDouble_QuoteSet,      status);
2390     //fSets->addElement(fKatakanaSet,          status); // Omit Katakana from fSets, which omits Katakana characters
2391                                                         // from the test data. They are all in the dictionary set,
2392                                                         // which this (old, to be retired) monkey test cannot handle.
2393     fSets->addElement(fMidLetterSet,         status);
2394     fSets->addElement(fMidNumLetSet,         status);
2395     fSets->addElement(fMidNumSet,            status);
2396     fSets->addElement(fNumericSet,           status);
2397     fSets->addElement(fFormatSet,            status);
2398     fSets->addElement(fExtendSet,            status);
2399     fSets->addElement(fOtherSet,             status);
2400     fSets->addElement(fExtendNumLetSet,      status);
2401
2402     fSets->addElement(fEBaseSet,             status);
2403     fSets->addElement(fEBGSet,               status);
2404     fSets->addElement(fEModifierSet,         status);
2405     fSets->addElement(fZWJSet,               status);
2406     fSets->addElement(fExtendedPictSet,      status);
2407     fSets->addElement(fEmojiNRKSet,          status);
2408
2409     if (U_FAILURE(status)) {
2410         deferredStatus = status;
2411     }
2412 }
2413
2414 void RBBIWordMonkey::setText(const UnicodeString &s) {
2415     fText       = &s;
2416 }
2417
2418
2419 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2420     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2421                               //   break position being tested.  The candidate break
2422                               //   location is before p2.
2423
2424     int     breakPos = -1;
2425
2426     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2427
2428     if (U_FAILURE(deferredStatus)) {
2429         return -1;
2430     }
2431
2432     // Prev break at end of string.  return DONE.
2433     if (prevPos >= fText->length()) {
2434         return -1;
2435     }
2436     p0 = p1 = p2 = p3 = prevPos;
2437     c3 =  fText->char32At(prevPos);
2438     c0 = c1 = c2 = 0;
2439     (void)p0;       // Suppress set but not used warning.
2440
2441     // Loop runs once per "significant" character position in the input text.
2442     for (;;) {
2443         // Move all of the positions forward in the input string.
2444         p0 = p1;  c0 = c1;
2445         p1 = p2;  c1 = c2;
2446         p2 = p3;  c2 = c3;
2447
2448         // Advancd p3 by    X(Extend | Format)*   Rule 4
2449         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2450         do {
2451             p3 = fText->moveIndex32(p3, 1);
2452             c3 = fText->char32At(p3);
2453             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2454                break;
2455             };
2456         }
2457         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWJSet->contains(c3));
2458
2459
2460         if (p1 == p2) {
2461             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2462             continue;
2463         }
2464         if (p2 == fText->length()) {
2465             // Reached end of string.  Always a break position.
2466             break;
2467         }
2468
2469         // Rule  (3)   CR x LF
2470         //     No Extend or Format characters may appear between the CR and LF,
2471         //     which requires the additional check for p2 immediately following p1.
2472         //
2473         if (c1==0x0D && c2==0x0A) {
2474             continue;
2475         }
2476
2477         // Rule (3a)  Break before and after newlines (including CR and LF)
2478         //
2479         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2480             break;
2481         };
2482         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2483             break;
2484         };
2485
2486         // Rule (3c)    ZWJ x (Glue_after_ZWJ | EmojiNRK).
2487         //              Not ignoring extend chars, so peek into input text to
2488         //              get the potential ZWJ, the character immediately preceding c2.
2489         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2490         //              but char32At will get the full code point.
2491         if (fZWJSet->contains(fText->char32At(p2-1)) && (fExtendedPictSet->contains(c2) || fEmojiNRKSet->contains(c2))) {
2492             continue;
2493         }
2494
2495         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2496         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2497             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2498             continue;
2499         }
2500
2501         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2502         //
2503         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2504              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2505              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2506             continue;
2507         }
2508
2509         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2510         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2511             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2512             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2513             continue;
2514         }
2515
2516         // Rule (7a)     Hebrew_Letter x Single_Quote
2517         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2518             continue;
2519         }
2520
2521         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2522         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2523             continue;
2524         }
2525
2526         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2527         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2528             continue;
2529         }
2530
2531         // Rule (8)    Numeric x Numeric
2532         if (fNumericSet->contains(c1) &&
2533             fNumericSet->contains(c2))  {
2534             continue;
2535         }
2536
2537         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2538         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2539             fNumericSet->contains(c2))  {
2540             continue;
2541         }
2542
2543         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2544         if (fNumericSet->contains(c1) &&
2545             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2546             continue;
2547         }
2548
2549         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2550         if (fNumericSet->contains(c0) &&
2551             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2552             fNumericSet->contains(c2)) {
2553             continue;
2554         }
2555
2556         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2557         if (fNumericSet->contains(c1) &&
2558             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2559             fNumericSet->contains(c3)) {
2560             continue;
2561         }
2562
2563         // Rule (13)  Katakana x Katakana
2564         //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
2565         //                  all Katakana are handled by the dictionary breaker.
2566         if (fKatakanaSet->contains(c1) &&
2567             fKatakanaSet->contains(c2))  {
2568             continue;
2569         }
2570
2571         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2572         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2573              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2574              fExtendNumLetSet->contains(c2)) {
2575                 continue;
2576         }
2577
2578         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2579         if (fExtendNumLetSet->contains(c1) &&
2580                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2581                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2582             continue;
2583         }
2584
2585         // WB 14  (E_Base | EBG) x E_Modifier
2586         if ((fEBaseSet->contains(c1)  || fEBGSet->contains(c1)) && fEModifierSet->contains(c2)) {
2587             continue;
2588         }
2589
2590         // Rule 15 - 17   Group pairs of Regional Indicators.
2591         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2592             break;
2593         }
2594         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2595             continue;
2596         }
2597
2598         // Rule 999.  Break found here.
2599         break;
2600     }
2601
2602     breakPos = p2;
2603     return breakPos;
2604 }
2605
2606
2607 UVector  *RBBIWordMonkey::charClasses() {
2608     return fSets;
2609 }
2610
2611
2612 RBBIWordMonkey::~RBBIWordMonkey() {
2613     delete fSets;
2614     delete fCRSet;
2615     delete fLFSet;
2616     delete fNewlineSet;
2617     delete fKatakanaSet;
2618     delete fHebrew_LetterSet;
2619     delete fALetterSet;
2620     delete fSingle_QuoteSet;
2621     delete fDouble_QuoteSet;
2622     delete fMidNumLetSet;
2623     delete fMidLetterSet;
2624     delete fMidNumSet;
2625     delete fNumericSet;
2626     delete fFormatSet;
2627     delete fExtendSet;
2628     delete fExtendNumLetSet;
2629     delete fRegionalIndicatorSet;
2630     delete fDictionarySet;
2631     delete fOtherSet;
2632     delete fEBaseSet;
2633     delete fEBGSet;
2634     delete fEModifierSet;
2635     delete fZWJSet;
2636     delete fExtendedPictSet;
2637     delete fEmojiNRKSet;
2638 }
2639
2640
2641
2642
2643 //------------------------------------------------------------------------------------------
2644 //
2645 //   class RBBISentMonkey      Sentence Break specific implementation
2646 //                             of RBBIMonkeyKind.
2647 //
2648 //------------------------------------------------------------------------------------------
2649 class RBBISentMonkey: public RBBIMonkeyKind {
2650 public:
2651     RBBISentMonkey();
2652     virtual          ~RBBISentMonkey();
2653     virtual  UVector *charClasses();
2654     virtual  void     setText(const UnicodeString &s);
2655     virtual int32_t   next(int32_t i);
2656 private:
2657     int               moveBack(int posFrom);
2658     int               moveForward(int posFrom);
2659     UChar32           cAt(int pos);
2660
2661     UVector      *fSets;
2662
2663     UnicodeSet  *fSepSet;
2664     UnicodeSet  *fFormatSet;
2665     UnicodeSet  *fSpSet;
2666     UnicodeSet  *fLowerSet;
2667     UnicodeSet  *fUpperSet;
2668     UnicodeSet  *fOLetterSet;
2669     UnicodeSet  *fNumericSet;
2670     UnicodeSet  *fATermSet;
2671     UnicodeSet  *fSContinueSet;
2672     UnicodeSet  *fSTermSet;
2673     UnicodeSet  *fCloseSet;
2674     UnicodeSet  *fOtherSet;
2675     UnicodeSet  *fExtendSet;
2676
2677     const UnicodeString  *fText;
2678
2679 };
2680
2681 RBBISentMonkey::RBBISentMonkey()
2682 {
2683     UErrorCode  status = U_ZERO_ERROR;
2684
2685     fSets            = new UVector(status);
2686
2687     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2688     //                       set and made into character classes of their own.  For the monkey impl,
2689     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2690     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2691     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2692     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2693     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2694     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2695     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2696     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2697     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2698     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2699     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2700     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2701     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2702     fOtherSet        = new UnicodeSet();
2703
2704     if(U_FAILURE(status)) {
2705       deferredStatus = status;
2706       return;
2707     }
2708
2709     fOtherSet->complement();
2710     fOtherSet->removeAll(*fSepSet);
2711     fOtherSet->removeAll(*fFormatSet);
2712     fOtherSet->removeAll(*fSpSet);
2713     fOtherSet->removeAll(*fLowerSet);
2714     fOtherSet->removeAll(*fUpperSet);
2715     fOtherSet->removeAll(*fOLetterSet);
2716     fOtherSet->removeAll(*fNumericSet);
2717     fOtherSet->removeAll(*fATermSet);
2718     fOtherSet->removeAll(*fSContinueSet);
2719     fOtherSet->removeAll(*fSTermSet);
2720     fOtherSet->removeAll(*fCloseSet);
2721     fOtherSet->removeAll(*fExtendSet);
2722
2723     fSets->addElement(fSepSet,       status);
2724     fSets->addElement(fFormatSet,    status);
2725     fSets->addElement(fSpSet,        status);
2726     fSets->addElement(fLowerSet,     status);
2727     fSets->addElement(fUpperSet,     status);
2728     fSets->addElement(fOLetterSet,   status);
2729     fSets->addElement(fNumericSet,   status);
2730     fSets->addElement(fATermSet,     status);
2731     fSets->addElement(fSContinueSet, status);
2732     fSets->addElement(fSTermSet,     status);
2733     fSets->addElement(fCloseSet,     status);
2734     fSets->addElement(fOtherSet,     status);
2735     fSets->addElement(fExtendSet,    status);
2736
2737     if (U_FAILURE(status)) {
2738         deferredStatus = status;
2739     }
2740 }
2741
2742
2743
2744 void RBBISentMonkey::setText(const UnicodeString &s) {
2745     fText       = &s;
2746 }
2747
2748 UVector  *RBBISentMonkey::charClasses() {
2749     return fSets;
2750 }
2751
2752
2753 //  moveBack()   Find the "significant" code point preceding the index i.
2754 //               Skips over ($Extend | $Format)* .
2755 //
2756 int RBBISentMonkey::moveBack(int i) {
2757     if (i <= 0) {
2758         return -1;
2759     }
2760     UChar32   c;
2761     int32_t   j = i;
2762     do {
2763         j = fText->moveIndex32(j, -1);
2764         c = fText->char32At(j);
2765     }
2766     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2767     return j;
2768
2769  }
2770
2771
2772 int RBBISentMonkey::moveForward(int i) {
2773     if (i>=fText->length()) {
2774         return fText->length();
2775     }
2776     UChar32   c;
2777     int32_t   j = i;
2778     do {
2779         j = fText->moveIndex32(j, 1);
2780         c = cAt(j);
2781     }
2782     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2783     return j;
2784 }
2785
2786 UChar32 RBBISentMonkey::cAt(int pos) {
2787     if (pos<0 || pos>=fText->length()) {
2788         return -1;
2789     } else {
2790         return fText->char32At(pos);
2791     }
2792 }
2793
2794 int32_t RBBISentMonkey::next(int32_t prevPos) {
2795     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2796                               //   break position being tested.  The candidate break
2797                               //   location is before p2.
2798
2799     int     breakPos = -1;
2800
2801     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2802     UChar32 c;
2803
2804     if (U_FAILURE(deferredStatus)) {
2805         return -1;
2806     }
2807
2808     // Prev break at end of string.  return DONE.
2809     if (prevPos >= fText->length()) {
2810         return -1;
2811     }
2812     p0 = p1 = p2 = p3 = prevPos;
2813     c3 =  fText->char32At(prevPos);
2814     c0 = c1 = c2 = 0;
2815     (void)p0;     // Suppress set but not used warning.
2816
2817     // Loop runs once per "significant" character position in the input text.
2818     for (;;) {
2819         // Move all of the positions forward in the input string.
2820         p0 = p1;  c0 = c1;
2821         p1 = p2;  c1 = c2;
2822         p2 = p3;  c2 = c3;
2823
2824         // Advancd p3 by    X(Extend | Format)*   Rule 4
2825         p3 = moveForward(p3);
2826         c3 = cAt(p3);
2827
2828         // Rule (3)  CR x LF
2829         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2830             continue;
2831         }
2832
2833         // Rule (4).   Sep  <break>
2834         if (fSepSet->contains(c1)) {
2835             p2 = p1+1;   // Separators don't combine with Extend or Format.
2836             break;
2837         }
2838
2839         if (p2 >= fText->length()) {
2840             // Reached end of string.  Always a break position.
2841             break;
2842         }
2843
2844         if (p2 == prevPos) {
2845             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2846             continue;
2847         }
2848
2849         // Rule (6).   ATerm x Numeric
2850         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2851             continue;
2852         }
2853
2854         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2855         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2856                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2857             continue;
2858         }
2859
2860         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2861         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2862         //                  note to the Unicode 5.0 documents.
2863         int p8 = p1;
2864         while (fSpSet->contains(cAt(p8))) {
2865             p8 = moveBack(p8);
2866         }
2867         while (fCloseSet->contains(cAt(p8))) {
2868             p8 = moveBack(p8);
2869         }
2870         if (fATermSet->contains(cAt(p8))) {
2871             p8=p2;
2872             for (;;) {
2873                 c = cAt(p8);
2874                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2875                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2876                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2877                     break;
2878                 }
2879                 p8 = moveForward(p8);
2880             }
2881             if (fLowerSet->contains(cAt(p8))) {
2882                 continue;
2883             }
2884         }
2885
2886         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2887         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2888             p8 = p1;
2889             while (fSpSet->contains(cAt(p8))) {
2890                 p8 = moveBack(p8);
2891             }
2892             while (fCloseSet->contains(cAt(p8))) {
2893                 p8 = moveBack(p8);
2894             }
2895             c = cAt(p8);
2896             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2897                 continue;
2898             }
2899         }
2900
2901         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2902         int p9 = p1;
2903         while (fCloseSet->contains(cAt(p9))) {
2904             p9 = moveBack(p9);
2905         }
2906         c = cAt(p9);
2907         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2908             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2909                 continue;
2910             }
2911         }
2912
2913         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2914         int p10 = p1;
2915         while (fSpSet->contains(cAt(p10))) {
2916             p10 = moveBack(p10);
2917         }
2918         while (fCloseSet->contains(cAt(p10))) {
2919             p10 = moveBack(p10);
2920         }
2921         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2922             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2923                 continue;
2924             }
2925         }
2926
2927         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2928         int p11 = p1;
2929         if (fSepSet->contains(cAt(p11))) {
2930             p11 = moveBack(p11);
2931         }
2932         while (fSpSet->contains(cAt(p11))) {
2933             p11 = moveBack(p11);
2934         }
2935         while (fCloseSet->contains(cAt(p11))) {
2936             p11 = moveBack(p11);
2937         }
2938         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2939             break;
2940         }
2941
2942         //  Rule (12)  Any x Any
2943         continue;
2944     }
2945     breakPos = p2;
2946     return breakPos;
2947 }
2948
2949 RBBISentMonkey::~RBBISentMonkey() {
2950     delete fSets;
2951     delete fSepSet;
2952     delete fFormatSet;
2953     delete fSpSet;
2954     delete fLowerSet;
2955     delete fUpperSet;
2956     delete fOLetterSet;
2957     delete fNumericSet;
2958     delete fATermSet;
2959     delete fSContinueSet;
2960     delete fSTermSet;
2961     delete fCloseSet;
2962     delete fOtherSet;
2963     delete fExtendSet;
2964 }
2965
2966
2967
2968 //-------------------------------------------------------------------------------------------
2969 //
2970 //  RBBILineMonkey
2971 //
2972 //-------------------------------------------------------------------------------------------
2973
2974 class RBBILineMonkey: public RBBIMonkeyKind {
2975 public:
2976     RBBILineMonkey();
2977     virtual          ~RBBILineMonkey();
2978     virtual  UVector *charClasses();
2979     virtual  void     setText(const UnicodeString &s);
2980     virtual  int32_t  next(int32_t i);
2981     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2982 private:
2983     UVector      *fSets;
2984
2985     UnicodeSet  *fBK;
2986     UnicodeSet  *fCR;
2987     UnicodeSet  *fLF;
2988     UnicodeSet  *fCM;
2989     UnicodeSet  *fNL;
2990     UnicodeSet  *fSG;
2991     UnicodeSet  *fWJ;
2992     UnicodeSet  *fZW;
2993     UnicodeSet  *fGL;
2994     UnicodeSet  *fCB;
2995     UnicodeSet  *fSP;
2996     UnicodeSet  *fB2;
2997     UnicodeSet  *fBA;
2998     UnicodeSet  *fBB;
2999     UnicodeSet  *fHY;
3000     UnicodeSet  *fH2;
3001     UnicodeSet  *fH3;
3002     UnicodeSet  *fCL;
3003     UnicodeSet  *fCP;
3004     UnicodeSet  *fEX;
3005     UnicodeSet  *fIN;
3006     UnicodeSet  *fJL;
3007     UnicodeSet  *fJV;
3008     UnicodeSet  *fJT;
3009     UnicodeSet  *fNS;
3010     UnicodeSet  *fOP;
3011     UnicodeSet  *fQU;
3012     UnicodeSet  *fIS;
3013     UnicodeSet  *fNU;
3014     UnicodeSet  *fPO;
3015     UnicodeSet  *fPR;
3016     UnicodeSet  *fSY;
3017     UnicodeSet  *fAI;
3018     UnicodeSet  *fAL;
3019     UnicodeSet  *fCJ;
3020     UnicodeSet  *fHL;
3021     UnicodeSet  *fID;
3022     UnicodeSet  *fRI;
3023     UnicodeSet  *fXX;
3024     UnicodeSet  *fEB;
3025     UnicodeSet  *fEM;
3026     UnicodeSet  *fZJ;
3027     UnicodeSet  *fExtendedPict;
3028     UnicodeSet  *fEmojiNRK;
3029
3030     BreakIterator        *fCharBI;
3031     const UnicodeString  *fText;
3032     RegexMatcher         *fNumberMatcher;
3033 };
3034
3035 RBBILineMonkey::RBBILineMonkey() :
3036     RBBIMonkeyKind(),
3037     fSets(NULL),
3038
3039     fCharBI(NULL),
3040     fText(NULL),
3041     fNumberMatcher(NULL)
3042
3043 {
3044     if (U_FAILURE(deferredStatus)) {
3045         return;
3046     }
3047
3048     UErrorCode  status = U_ZERO_ERROR;
3049
3050     fSets  = new UVector(status);
3051
3052     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3053     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3054     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3055     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3056     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3057     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3058     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3059     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3060     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3061     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3062     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3063     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3064     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3065     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3066     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3067     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3068     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3069     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3070     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3071     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3072     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3073     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3074     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3075     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3076     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3077     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3078     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3079     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3080     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3081     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3082     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3083     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3084     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3085     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3086     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3087     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3088     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3089     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3090     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3091     fEB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Line_break=EB}] \\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F91D\\U0001F93C]"), status);
3092     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EM}]"), status);
3093     fZJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZWJ}]"), status);
3094     fEmojiNRK = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9\\u00a9\\u00ae\\u2122\\u3030\\u303d]]"), status);
3095     fExtendedPict = new UnicodeSet(gExtended_Pict, status);
3096
3097     if (U_FAILURE(status)) {
3098         deferredStatus = status;
3099         return;
3100     }
3101
3102     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3103     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3104     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3105
3106     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3107     fCM->addAll(*fZJ);     // ZWJ behaves as a CM.
3108
3109     fSets->addElement(fBK, status);
3110     fSets->addElement(fCR, status);
3111     fSets->addElement(fLF, status);
3112     fSets->addElement(fCM, status);
3113     fSets->addElement(fNL, status);
3114     fSets->addElement(fWJ, status);
3115     fSets->addElement(fZW, status);
3116     fSets->addElement(fGL, status);
3117     fSets->addElement(fCB, status);
3118     fSets->addElement(fSP, status);
3119     fSets->addElement(fB2, status);
3120     fSets->addElement(fBA, status);
3121     fSets->addElement(fBB, status);
3122     fSets->addElement(fHY, status);
3123     fSets->addElement(fH2, status);
3124     fSets->addElement(fH3, status);
3125     fSets->addElement(fCL, status);
3126     fSets->addElement(fCP, status);
3127     fSets->addElement(fEX, status);
3128     fSets->addElement(fIN, status);
3129     fSets->addElement(fJL, status);
3130     fSets->addElement(fJT, status);
3131     fSets->addElement(fJV, status);
3132     fSets->addElement(fNS, status);
3133     fSets->addElement(fOP, status);
3134     fSets->addElement(fQU, status);
3135     fSets->addElement(fIS, status);
3136     fSets->addElement(fNU, status);
3137     fSets->addElement(fPO, status);
3138     fSets->addElement(fPR, status);
3139     fSets->addElement(fSY, status);
3140     fSets->addElement(fAI, status);
3141     fSets->addElement(fAL, status);
3142     fSets->addElement(fHL, status);
3143     fSets->addElement(fID, status);
3144     fSets->addElement(fWJ, status);
3145     fSets->addElement(fRI, status);
3146     fSets->addElement(fSG, status);
3147     fSets->addElement(fEB, status);
3148     fSets->addElement(fEM, status);
3149     fSets->addElement(fZJ, status);
3150     fSets->addElement(fExtendedPict, status);
3151     fSets->addElement(fEmojiNRK, status);
3152
3153
3154     const char *rules =
3155             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?"
3156             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})(\\p{Line_Break=CM}|\\u200d)*)?"
3157             "\\p{Line_Break=NU}(\\p{Line_Break=CM}|\\u200d)*"
3158             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})(\\p{Line_Break=CM}|\\u200d)*)*"
3159             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})(\\p{Line_Break=CM}|\\u200d)*)?"
3160             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})(\\p{Line_Break=CM}|\\u200d)*)?";
3161
3162     fNumberMatcher = new RegexMatcher(
3163         UnicodeString(rules, -1, US_INV), 0, status);
3164
3165     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3166
3167     if (U_FAILURE(status)) {
3168         deferredStatus = status;
3169     }
3170 }
3171
3172
3173 void RBBILineMonkey::setText(const UnicodeString &s) {
3174     fText       = &s;
3175     fCharBI->setText(s);
3176     fNumberMatcher->reset(s);
3177 }
3178
3179 //
3180 //  rule9Adjust
3181 //     Line Break TR rules 9 and 10 implementation.
3182 //     This deals with combining marks and other sequences that
3183 //     that must be treated as if they were something other than what they actually are.
3184 //
3185 //     This is factored out into a separate function because it must be applied twice for
3186 //     each potential break, once to the chars before the position being checked, then
3187 //     again to the text following the possible break.
3188 //
3189 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3190     if (pos == -1) {
3191         // Invalid initial position.  Happens during the warmup iteration of the
3192         //   main loop in next().
3193         return;
3194     }
3195
3196     int32_t  nPos = *nextPos;
3197
3198     // LB 9  Keep combining sequences together.
3199     //  advance over any CM class chars.  Note that Line Break CM is different
3200     //  from the normal Grapheme Extend property.
3201     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3202           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3203         for (;;) {
3204             *nextChar = fText->char32At(nPos);
3205             if (!fCM->contains(*nextChar)) {
3206                 break;
3207             }
3208             nPos = fText->moveIndex32(nPos, 1);
3209         }
3210     }
3211
3212
3213     // LB 9 Treat X CM* as if it were x.
3214     //       No explicit action required.
3215
3216     // LB 10  Treat any remaining combining mark as AL
3217     if (fCM->contains(*posChar)) {
3218         *posChar = u'A';
3219     }
3220
3221     // Push the updated nextPos and nextChar back to our caller.
3222     // This only makes a difference if posChar got bigger by consuming a
3223     // combining sequence.
3224     *nextPos  = nPos;
3225     *nextChar = fText->char32At(nPos);
3226 }
3227
3228
3229
3230 int32_t RBBILineMonkey::next(int32_t startPos) {
3231     UErrorCode status = U_ZERO_ERROR;
3232     int32_t    pos;       //  Index of the char following a potential break position
3233     UChar32    thisChar;  //  Character at above position "pos"
3234
3235     int32_t    prevPos;   //  Index of the char preceding a potential break position
3236     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3237                           //   and thisChar may not be adjacent because combining
3238                           //   characters between them will be ignored.
3239
3240     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3241     UChar32    prevCharX2;
3242
3243     int32_t    nextPos;   //  Index of the next character following pos.
3244                           //     Usually skips over combining marks.
3245     int32_t    nextCPPos; //  Index of the code point following "pos."
3246                           //     May point to a combining mark.
3247     int32_t    tPos;      //  temp value.
3248     UChar32    c;
3249
3250     if (U_FAILURE(deferredStatus)) {
3251         return -1;
3252     }
3253
3254     if (startPos >= fText->length()) {
3255         return -1;
3256     }
3257
3258
3259     // Initial values for loop.  Loop will run the first time without finding breaks,
3260     //                           while the invalid values shift out and the "this" and
3261     //                           "prev" positions are filled in with good values.
3262     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3263     thisChar = prevChar  = prevCharX2 = 0;
3264     nextPos  = nextCPPos = startPos;
3265
3266
3267     // Loop runs once per position in the test text, until a break position
3268     //  is found.
3269     for (;;) {
3270         prevPosX2 = prevPos;
3271         prevCharX2 = prevChar;
3272
3273         prevPos   = pos;
3274         prevChar  = thisChar;
3275
3276         pos       = nextPos;
3277         thisChar  = fText->char32At(pos);
3278
3279         nextCPPos = fText->moveIndex32(pos, 1);
3280         nextPos   = nextCPPos;
3281
3282         // Rule LB2 - Break at end of text.
3283         if (pos >= fText->length()) {
3284             break;
3285         }
3286
3287         // Rule LB 9 - adjust for combining sequences.
3288         //             We do this one out-of-order because the adjustment does not change anything
3289         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3290         //             be applied.
3291         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3292         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3293         c = fText->char32At(nextPos);
3294         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3295
3296         // If the loop is still warming up - if we haven't shifted the initial
3297         //   -1 positions out of prevPos yet - loop back to advance the
3298         //    position in the input without any further looking for breaks.
3299         if (prevPos == -1) {
3300             continue;
3301         }
3302
3303         // LB 4  Always break after hard line breaks,
3304         if (fBK->contains(prevChar)) {
3305             break;
3306         }
3307
3308         // LB 5  Break after CR, LF, NL, but not inside CR LF
3309         if (prevChar == 0x0d && thisChar == 0x0a) {
3310             continue;
3311         }
3312         if (prevChar == 0x0d ||
3313             prevChar == 0x0a ||
3314             prevChar == 0x85)  {
3315             break;
3316         }
3317
3318         // LB 6  Don't break before hard line breaks
3319         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3320             fBK->contains(thisChar)) {
3321                 continue;
3322         }
3323
3324
3325         // LB 7  Don't break before spaces or zero-width space.
3326         if (fSP->contains(thisChar)) {
3327             continue;
3328         }
3329
3330         if (fZW->contains(thisChar)) {
3331             continue;
3332         }
3333
3334         // LB 8  Break after zero width space
3335         if (fZW->contains(prevChar)) {
3336             break;
3337         }
3338
3339         // LB 8a ZWJ x (ID | ExtendedPict | Emoji)
3340         //       The monkey test's way of ignoring combining characters doesn't work
3341         //       for this rule. ZJ is also a CM. Need to get the actual character
3342         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3343         {
3344             int32_t prevIdx = fText->moveIndex32(pos, -1);
3345             UChar32 prevC = fText->char32At(prevIdx);
3346             if (fZJ->contains(prevC) && (fID->contains(thisChar) || fExtendedPict->contains(thisChar) || fEmojiNRK->contains(thisChar))) {
3347                 continue;
3348             }
3349         }
3350
3351         // LB 9, 10  Already done, at top of loop.
3352         //
3353
3354
3355         // LB 11  Do not break before or after WORD JOINER and related characters.
3356         //    x  WJ
3357         //    WJ  x
3358         //
3359         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3360             continue;
3361         }
3362
3363         // LB 12
3364         //    GL  x
3365         if (fGL->contains(prevChar)) {
3366             continue;
3367         }
3368
3369         // LB 12a
3370         //    [^SP BA HY] x GL
3371         if (!(fSP->contains(prevChar) ||
3372               fBA->contains(prevChar) ||
3373               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3374             continue;
3375         }
3376
3377
3378
3379         // LB 13  Don't break before closings.
3380         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3381         //        fall into LB 17 and the more general number regular expression.
3382         //
3383         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3384             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3385                                          fEX->contains(thisChar)  ||
3386             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3387             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3388             continue;
3389         }
3390
3391         // LB 14 Don't break after OP SP*
3392         //       Scan backwards, checking for this sequence.
3393         //       The OP char could include combining marks, so we actually check for
3394         //           OP CM* SP*
3395         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3396         //       sequence into a ID char, so before scanning back through spaces,
3397         //       verify that prevChar is indeed a space.  The prevChar variable
3398         //       may differ from fText[prevPos]
3399         tPos = prevPos;
3400         if (fSP->contains(prevChar)) {
3401             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3402                 tPos=fText->moveIndex32(tPos, -1);
3403             }
3404         }
3405         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3406             tPos=fText->moveIndex32(tPos, -1);
3407         }
3408         if (fOP->contains(fText->char32At(tPos))) {
3409             continue;
3410         }
3411
3412
3413         // LB 15    QU SP* x OP
3414         if (fOP->contains(thisChar)) {
3415             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3416             int tPos = prevPos;
3417             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3418                 tPos = fText->moveIndex32(tPos, -1);
3419             }
3420             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3421                 tPos = fText->moveIndex32(tPos, -1);
3422             }
3423             if (fQU->contains(fText->char32At(tPos))) {
3424                 continue;
3425             }
3426         }
3427
3428
3429
3430         // LB 16   (CL | CP) SP* x NS
3431         //    Scan backwards for SP* CM* (CL | CP)
3432         if (fNS->contains(thisChar)) {
3433             int tPos = prevPos;
3434             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3435                 tPos = fText->moveIndex32(tPos, -1);
3436             }
3437             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3438                 tPos = fText->moveIndex32(tPos, -1);
3439             }
3440             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3441                 continue;
3442             }
3443         }
3444
3445
3446         // LB 17        B2 SP* x B2
3447         if (fB2->contains(thisChar)) {
3448             //  Scan backwards, checking for the B2 CM* SP* sequence.
3449             tPos = prevPos;
3450             if (fSP->contains(prevChar)) {
3451                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3452                     tPos=fText->moveIndex32(tPos, -1);
3453                 }
3454             }
3455             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3456                 tPos=fText->moveIndex32(tPos, -1);
3457             }
3458             if (fB2->contains(fText->char32At(tPos))) {
3459                 continue;
3460             }
3461         }
3462
3463
3464         // LB 18    break after space
3465         if (fSP->contains(prevChar)) {
3466             break;
3467         }
3468
3469         // LB 19
3470         //    x   QU
3471         //    QU  x
3472         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3473             continue;
3474         }
3475
3476         // LB 20  Break around a CB
3477         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3478             break;
3479         }
3480
3481         // LB 21
3482         if (fBA->contains(thisChar) ||
3483             fHY->contains(thisChar) ||
3484             fNS->contains(thisChar) ||
3485             fBB->contains(prevChar) )   {
3486             continue;
3487         }
3488
3489         // LB 21a
3490         //   HL (HY | BA) x
3491         if (fHL->contains(prevCharX2) &&
3492                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3493             continue;
3494         }
3495
3496         // LB 21b
3497         //   SY x HL
3498         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3499             continue;
3500         }
3501
3502         // LB 22
3503         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3504             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3505             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3506             ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) && fIN->contains(thisChar)) ||
3507             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3508             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3509             continue;
3510         }
3511
3512
3513         // LB 23    (AL | HL) x NU
3514         //          NU x (AL | HL)
3515         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fNU->contains(thisChar)) {
3516             continue;
3517         }
3518         if (fNU->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3519             continue;
3520         }
3521
3522         // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
3523         //      PR x (ID | EB | EM)
3524         //     (ID | EB | EM) x PO
3525         if (fPR->contains(prevChar) &&
3526                 (fID->contains(thisChar) || fEB->contains(thisChar) || fEM->contains(thisChar)))  {
3527             continue;
3528         }
3529         if ((fID->contains(prevChar) || fEB->contains(prevChar) || fEM->contains(prevChar)) &&
3530                 fPO->contains(thisChar)) {
3531             continue;
3532         }
3533
3534         // LB 24  Do not break between prefix and letters or ideographs.
3535         //         (PR | PO) x (AL | HL)
3536         //         (AL | HL) x (PR | PO)
3537         if ((fPR->contains(prevChar) || fPO->contains(prevChar)) &&
3538                 (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3539             continue;
3540         }
3541         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) &&
3542                 (fPR->contains(thisChar) || fPO->contains(thisChar))) {
3543             continue;
3544         }
3545
3546
3547
3548         // LB 25    Numbers
3549         if (fNumberMatcher->lookingAt(prevPos, status)) {
3550             if (U_FAILURE(status)) {
3551                 break;
3552             }
3553             // Matched a number.  But could have been just a single digit, which would
3554             //    not represent a "no break here" between prevChar and thisChar
3555             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3556             if (numEndIdx > pos) {
3557                 // Number match includes at least our two chars being checked
3558                 if (numEndIdx > nextPos) {
3559                     // Number match includes additional chars.  Update pos and nextPos
3560                     //   so that next loop iteration will continue at the end of the number,
3561                     //   checking for breaks between last char in number & whatever follows.
3562                     pos = nextPos = numEndIdx;
3563                     do {
3564                         pos = fText->moveIndex32(pos, -1);
3565                         thisChar = fText->char32At(pos);
3566                     } while (fCM->contains(thisChar));
3567                 }
3568                 continue;
3569             }
3570         }
3571
3572
3573         // LB 26 Do not break a Korean syllable.
3574         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3575                                         fJV->contains(thisChar) ||
3576                                         fH2->contains(thisChar) ||
3577                                         fH3->contains(thisChar))) {
3578                                             continue;
3579                                         }
3580
3581         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3582             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3583                 continue;
3584         }
3585
3586         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3587             fJT->contains(thisChar)) {
3588                 continue;
3589         }
3590
3591         // LB 27 Treat a Korean Syllable Block the same as ID.
3592         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3593             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3594             fIN->contains(thisChar)) {
3595                 continue;
3596             }
3597         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3598             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3599             fPO->contains(thisChar)) {
3600                 continue;
3601             }
3602         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3603             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3604                 continue;
3605             }
3606
3607
3608
3609         // LB 28  Do not break between alphabetics ("at").
3610         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3611             continue;
3612         }
3613
3614         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3615         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3616             continue;
3617         }
3618
3619         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3620         //          (AL | NU) x OP
3621         //          CP x (AL | NU)
3622         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3623             continue;
3624         }
3625         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3626             continue;
3627         }
3628
3629         // LB30a    RI RI <break> RI
3630         //             RI    x    RI
3631         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3632             break;
3633         }
3634         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3635             continue;
3636         }
3637
3638         // LB30b    Emoji Base x Emoji Modifier
3639         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3640             continue;
3641         }
3642
3643         // LB 31    Break everywhere else
3644         break;
3645
3646     }
3647
3648     return pos;
3649 }
3650
3651
3652 UVector  *RBBILineMonkey::charClasses() {
3653     return fSets;
3654 }
3655
3656
//
//  Destructor
//     Releases the set vector, every character class set allocated by the
//     constructor, the character break iterator and the number matcher.
//     fText is not deleted; it only points at the caller's string.
//
RBBILineMonkey::~RBBILineMonkey() {
    delete fSets;

    // The sets are deleted one by one here, in the same order as before.
    UnicodeSet *ownedSets[] = {
        fBK, fCR, fLF, fCM, fNL, fWJ, fZW, fGL, fCB, fSP,
        fB2, fBA, fBB, fHY, fH2, fH3, fCL, fCP, fEX, fIN,
        fJL, fJV, fJT, fNS, fOP, fQU, fIS, fNU, fPO, fPR,
        fSY, fAI, fAL, fCJ, fHL, fID, fRI, fSG, fXX, fEB,
        fEM, fZJ, fExtendedPict, fEmojiNRK
    };
    const int32_t ownedSetCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
    for (int32_t setIdx = 0; setIdx < ownedSetCount; ++setIdx) {
        delete ownedSets[setIdx];
    }

    delete fCharBI;
    delete fNumberMatcher;
}
3708
3709
3710 //-------------------------------------------------------------------------------------------
3711 //
3712 //   TestMonkey
3713 //
3714 //     params
3715 //       seed=nnnnn        Random number starting seed.
3716 //                         Setting the seed allows errors to be reproduced.
3717 //       loop=nnn          Looping count.  Controls running time.
3718 //                         -1:  run forever.
3719 //                          0 or greater:  run length.
3720 //
3721 //       type = char | word | line | sent | title
3722 //
3723 //  Example:
3724 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3725 //
3726 //-------------------------------------------------------------------------------------------
3727
3728 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3729     int32_t val = defaultVal;
3730     name.append(" *= *(-?\\d+)");
3731     UErrorCode status = U_ZERO_ERROR;
3732     RegexMatcher m(name, params, 0, status);
3733     if (m.find()) {
3734         // The param exists.  Convert the string to an int.
3735         char valString[100];
3736         int32_t paramLength = m.end(1, status) - m.start(1, status);
3737         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3738             paramLength = (int32_t)(sizeof(valString)-2);
3739         }
3740         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3741         val = strtol(valString,  NULL, 10);
3742
3743         // Delete this parameter from the params string.
3744         m.reset();
3745         params = m.replaceFirst("", status);
3746     }
3747     U_ASSERT(U_SUCCESS(status));
3748     return val;
3749 }
3750 #endif
3751
3752 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3753 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3754                                     BreakIterator *bi,
3755                                     int expected[],
3756                                     int expectedcount)
3757 {
3758     int count = 0;
3759     int i = 0;
3760     int forward[50];
3761     bi->setText(ustr);
3762     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3763         forward[count] = i;
3764         if (count < expectedcount && expected[count] != i) {
3765             test->errln("break forward test failed: expected %d but got %d",
3766                         expected[count], i);
3767             break;
3768         }
3769         count ++;
3770     }
3771     if (count != expectedcount) {
3772         printStringBreaks(ustr, expected, expectedcount);
3773         test->errln("break forward test failed: missed %d match",
3774                     expectedcount - count);
3775         return;
3776     }
3777     // testing boundaries
3778     for (i = 1; i < expectedcount; i ++) {
3779         int j = expected[i - 1];
3780         if (!bi->isBoundary(j)) {
3781             printStringBreaks(ustr, expected, expectedcount);
3782             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3783             return;
3784         }
3785         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3786             if (bi->isBoundary(j)) {
3787                 printStringBreaks(ustr, expected, expectedcount);
3788                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3789                 return;
3790             }
3791         }
3792     }
3793
3794     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3795         count --;
3796         if (forward[count] != i) {
3797             printStringBreaks(ustr, expected, expectedcount);
3798             test->errln("happy break test previous() failed: expected %d but got %d",
3799                         forward[count], i);
3800             break;
3801         }
3802     }
3803     if (count != 0) {
3804         printStringBreaks(ustr, expected, expectedcount);
3805         test->errln("break test previous() failed: missed a match");
3806         return;
3807     }
3808
3809     // testing preceding
3810     for (i = 0; i < expectedcount - 1; i ++) {
3811         // int j = expected[i] + 1;
3812         int j = ustr.moveIndex32(expected[i], 1);
3813         for (; j <= expected[i + 1]; j ++) {
3814             if (bi->preceding(j) != expected[i]) {
3815                 printStringBreaks(ustr, expected, expectedcount);
3816                 test->errln("preceding(): Not expecting boundary at position %d", j);
3817                 return;
3818             }
3819         }
3820     }
3821 }
3822 #endif
3823
3824 void RBBITest::TestWordBreaks(void)
3825 {
3826 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3827
3828     Locale        locale("en");
3829     UErrorCode    status = U_ZERO_ERROR;
3830     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3831     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3832     // Replaced any C+J characters in a row with a random sequence of characters
3833     // of the same length to make our C+J segmentation not get in the way.
3834     static const char *strlist[] =
3835     {
3836     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3837     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3838     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3839     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3840     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3841     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3842     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3843     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3844     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3845     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3846     "\\u2027\\U000e0067\\u0a47\\u00b7",
3847     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3848     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3849     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3850     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3851     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3852     "\\u0027\\u11af\\U000e0057\\u0602",
3853     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3854     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3855     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3856     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3857     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3858     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3859     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3860     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3861     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3862     "\\u18f4\\U000e0049\\u20e7\\u2027",
3863     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3864     "\\ua183\\u102d\\u0bec\\u003a",
3865     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3866     "\\u003a\\u0e57\\u0fad\\u002e",
3867     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3868     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3869     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3870     "\\u003a\\u0664\\u00b7\\u1fba",
3871     "\\u003b\\u0027\\u00b7\\u47a3",
3872     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3873     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3874     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3875     };
3876     int loop;
3877     if (U_FAILURE(status)) {
3878         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3879         return;
3880     }
3881     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3882         // printf("looping %d\n", loop);
3883         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3884         // RBBICharMonkey monkey;
3885         RBBIWordMonkey monkey;
3886
3887         int expected[50];
3888         int expectedcount = 0;
3889
3890         monkey.setText(ustr);
3891         int i;
3892         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3893             expected[expectedcount ++] = i;
3894         }
3895
3896         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3897     }
3898     delete bi;
3899 #endif
3900 }
3901
3902 void RBBITest::TestWordBoundary(void)
3903 {
3904     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3905     Locale        locale("en");
3906     UErrorCode    status = U_ZERO_ERROR;
3907     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3908     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3909     UChar         str[50];
3910     static const char *strlist[] =
3911     {
3912     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3913     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3914     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3915     "\\u2027\\U000e0067\\u0a47\\u00b7",
3916     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3917     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3918     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3919     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3920     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3921     "\\u0027\\u11af\\U000e0057\\u0602",
3922     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3923     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3924     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3925     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3926     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3927     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3928     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3929     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3930     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3931     "\\u58f4\\U000e0049\\u20e7\\u2027",
3932     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3933     "\\ua183\\u102d\\u0bec\\u003a",
3934     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3935     "\\u003a\\u0e57\\u0fad\\u002e",
3936     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3937     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3938     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3939     "\\u003a\\u0664\\u00b7\\u1fba",
3940     "\\u003b\\u0027\\u00b7\\u47a3",
3941     };
3942     int loop;
3943     if (U_FAILURE(status)) {
3944         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3945         return;
3946     }
3947     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3948         // printf("looping %d\n", loop);
3949         u_unescape(strlist[loop], str, 20);
3950         UnicodeString ustr(str);
3951         int forward[50];
3952         int count = 0;
3953
3954         bi->setText(ustr);
3955         int prev = 0;
3956         int i;
3957         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3958             forward[count ++] = i;
3959             if (i > prev) {
3960                 int j;
3961                 for (j = prev + 1; j < i; j ++) {
3962                     if (bi->isBoundary(j)) {
3963                         printStringBreaks(ustr, forward, count);
3964                         errln("happy boundary test failed: expected %d not a boundary",
3965                                j);
3966                         return;
3967                     }
3968                 }
3969             }
3970             if (!bi->isBoundary(i)) {
3971                 printStringBreaks(ustr, forward, count);
3972                 errln("happy boundary test failed: expected %d a boundary",
3973                        i);
3974                 return;
3975             }
3976             prev = i;
3977         }
3978     }
3979     delete bi;
3980 }
3981
3982 void RBBITest::TestLineBreaks(void)
3983 {
3984 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3985     Locale        locale("en");
3986     UErrorCode    status = U_ZERO_ERROR;
3987     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3988     const int32_t  STRSIZE = 50;
3989     UChar         str[STRSIZE];
3990     static const char *strlist[] =
3991     {
3992      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3993      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3994              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3995      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3996              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3997      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3998      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3999      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4000      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4001      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4002      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4003      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4004      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4005      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4006      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4007      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4008      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4009      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4010      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4011      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4012      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4013      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4014      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4015      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4016      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4017      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4018      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4019      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4020      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4021      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4022      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4023      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4024      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4025      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4026      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4027      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4028      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4029      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4030      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4031          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4032     };
4033     int loop;
4034     TEST_ASSERT_SUCCESS(status);
4035     if (U_FAILURE(status)) {
4036         return;
4037     }
4038     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4039         // printf("looping %d\n", loop);
4040         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4041         if (t >= STRSIZE) {
4042             TEST_ASSERT(FALSE);
4043             continue;
4044         }
4045
4046
4047         UnicodeString ustr(str);
4048         RBBILineMonkey monkey;
4049         if (U_FAILURE(monkey.deferredStatus)) {
4050             continue;
4051         }
4052
4053         const int EXPECTEDSIZE = 50;
4054         int expected[EXPECTEDSIZE];
4055         int expectedcount = 0;
4056
4057         monkey.setText(ustr);
4058         int i;
4059         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4060             if (expectedcount >= EXPECTEDSIZE) {
4061                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4062                 return;
4063             }
4064             expected[expectedcount ++] = i;
4065         }
4066
4067         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4068     }
4069     delete bi;
4070 #endif
4071 }
4072
4073 void RBBITest::TestSentBreaks(void)
4074 {
4075 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4076     Locale        locale("en");
4077     UErrorCode    status = U_ZERO_ERROR;
4078     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4079     UChar         str[200];
4080     static const char *strlist[] =
4081     {
4082      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4083      "This\n",
4084      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4085      "\"Sentence ending with a quote.\" Bye.",
4086      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4087      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4088      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4089      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4090      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4091      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4092      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4093              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4094              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4095              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4096      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4097              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4098              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4099              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4100              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4101              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4102     };
4103     int loop;
4104     if (U_FAILURE(status)) {
4105         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4106         return;
4107     }
4108     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4109         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4110         UnicodeString ustr(str);
4111
4112         RBBISentMonkey monkey;
4113         if (U_FAILURE(monkey.deferredStatus)) {
4114             continue;
4115         }
4116
4117         const int EXPECTEDSIZE = 50;
4118         int expected[EXPECTEDSIZE];
4119         int expectedcount = 0;
4120
4121         monkey.setText(ustr);
4122         int i;
4123         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4124             if (expectedcount >= EXPECTEDSIZE) {
4125                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4126                 return;
4127             }
4128             expected[expectedcount ++] = i;
4129         }
4130
4131         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4132     }
4133     delete bi;
4134 #endif
4135 }
4136
4137 void RBBITest::TestMonkey() {
4138 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4139
4140     UErrorCode     status    = U_ZERO_ERROR;
4141     int32_t        loopCount = 500;
4142     int32_t        seed      = 1;
4143     UnicodeString  breakType = "all";
4144     Locale         locale("en");
4145     UBool          useUText  = FALSE;
4146
4147     if (quick == FALSE) {
4148         loopCount = 10000;
4149     }
4150
4151     if (fTestParams) {
4152         UnicodeString p(fTestParams);
4153         loopCount = getIntParam("loop", p, loopCount);
4154         seed      = getIntParam("seed", p, seed);
4155
4156         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4157         if (m.find()) {
4158             breakType = m.group(1, status);
4159             m.reset();
4160             p = m.replaceFirst("", status);
4161         }
4162
4163         RegexMatcher u(" *utext", p, 0, status);
4164         if (u.find()) {
4165             useUText = TRUE;
4166             u.reset();
4167             p = u.replaceFirst("", status);
4168         }
4169
4170
4171         // m.reset(p);
4172         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4173             // Each option is stripped out of the option string as it is processed.
4174             // All options have been checked.  The option string should have been completely emptied..
4175             char buf[100];
4176             p.extract(buf, sizeof(buf), NULL, status);
4177             buf[sizeof(buf)-1] = 0;
4178             errln("Unrecognized or extra parameter:  %s\n", buf);
4179             return;
4180         }
4181
4182     }
4183
4184     if (breakType == "char" || breakType == "all") {
4185         RBBICharMonkey  m;
4186         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4187         if (U_SUCCESS(status)) {
4188             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4189             if (breakType == "all" && useUText==FALSE) {
4190                 // Also run a quick test with UText when "all" is specified
4191                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4192             }
4193         }
4194         else {
4195             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4196         }
4197         delete bi;
4198     }
4199
4200     if (breakType == "word" || breakType == "all") {
4201         logln("Word Break Monkey Test");
4202         RBBIWordMonkey  m;
4203         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4204         if (U_SUCCESS(status)) {
4205             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4206         }
4207         else {
4208             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4209         }
4210         delete bi;
4211     }
4212
4213     if (breakType == "line" || breakType == "all") {
4214         logln("Line Break Monkey Test");
4215         RBBILineMonkey  m;
4216         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4217         if (loopCount >= 10) {
4218             loopCount = loopCount / 5;   // Line break runs slower than the others.
4219         }
4220         if (U_SUCCESS(status)) {
4221             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4222         }
4223         else {
4224             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4225         }
4226         delete bi;
4227     }
4228
4229     if (breakType == "sent" || breakType == "all"  ) {
4230         logln("Sentence Break Monkey Test");
4231         RBBISentMonkey  m;
4232         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4233         if (loopCount >= 10) {
4234             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4235         }
4236         if (U_SUCCESS(status)) {
4237             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4238         }
4239         else {
4240             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4241         }
4242         delete bi;
4243     }
4244
4245 #endif
4246 }
4247
4248 //
4249 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4250 //    Parameters:
4251 //       bi      - the break iterator to use
4252 //       mk      - MonkeyKind, abstraction for obtaining expected results
4253 //       name    - Name of test (char, word, etc.) for use in error messages
4254 //       seed    - Seed for starting random number generator (parameter from user)
4255 //       numIterations
4256 //
4257 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4258                          int32_t numIterations, UBool useUText) {
4259
4260 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4261
4262     const int32_t    TESTSTRINGLEN = 500;
4263     UnicodeString    testText;
4264     int32_t          numCharClasses;
4265     UVector          *chClasses;
4266     int              expected[TESTSTRINGLEN*2 + 1];
4267     int              expectedCount = 0;
4268     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4269     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4270     char             reverseBreaks[TESTSTRINGLEN*2+1];
4271     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4272     char             followingBreaks[TESTSTRINGLEN*2+1];
4273     char             precedingBreaks[TESTSTRINGLEN*2+1];
4274     int              i;
4275     int              loopCount = 0;
4276
4277     m_seed = seed;
4278
4279     numCharClasses = mk.charClasses()->size();
4280     chClasses      = mk.charClasses();
4281
4282     // Check for errors that occured during the construction of the MonkeyKind object.
4283     //  Can't report them where they occured because errln() is a method coming from intlTest,
4284     //  and is not visible outside of RBBITest :-(
4285     if (U_FAILURE(mk.deferredStatus)) {
4286         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4287         return;
4288     }
4289
4290     // Verify that the character classes all have at least one member.
4291     for (i=0; i<numCharClasses; i++) {
4292         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4293         if (s == NULL || s->size() == 0) {
4294             errln("Character Class #%d is null or of zero size.", i);
4295             return;
4296         }
4297     }
4298
4299     while (loopCount < numIterations || numIterations == -1) {
4300         if (numIterations == -1 && loopCount % 10 == 0) {
4301             // If test is running in an infinite loop, display a periodic tic so
4302             //   we can tell that it is making progress.
4303             fprintf(stderr, ".");
4304         }
4305         // Save current random number seed, so that we can recreate the random numbers
4306         //   for this loop iteration in event of an error.
4307         seed = m_seed;
4308
4309         // Populate a test string with data.
4310         testText.truncate(0);
4311         for (i=0; i<TESTSTRINGLEN; i++) {
4312             int32_t  aClassNum = m_rand() % numCharClasses;
4313             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4314             int32_t   charIdx = m_rand() % classSet->size();
4315             UChar32   c = classSet->charAt(charIdx);
4316             if (c < 0) {   // TODO:  deal with sets containing strings.
4317                 errln("%s:%d c < 0", __FILE__, __LINE__);
4318                 break;
4319             }
4320             // Do not assemble a supplementary character from randomly generated separate surrogates.
4321             //   (It could be a dictionary character)
4322             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4323                 continue;
4324             }
4325
4326             testText.append(c);
4327         }
4328
4329         // Calculate the expected results for this test string.
4330         mk.setText(testText);
4331         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4332         expectedBreaks[0] = 1;
4333         int32_t breakPos = 0;
4334         expectedCount = 0;
4335         for (;;) {
4336             breakPos = mk.next(breakPos);
4337             if (breakPos == -1) {
4338                 break;
4339             }
4340             if (breakPos > testText.length()) {
4341                 errln("breakPos > testText.length()");
4342             }
4343             expectedBreaks[breakPos] = 1;
4344             U_ASSERT(expectedCount<testText.length());
4345             expected[expectedCount ++] = breakPos;
4346             (void)expected;   // Set but not used warning.
4347                               // TODO (andy): check it out.
4348         }
4349
4350         // Find the break positions using forward iteration
4351         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4352         if (useUText) {
4353             UErrorCode status = U_ZERO_ERROR;
4354             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4355             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4356             bi->setText(testUText, status);
4357             TEST_ASSERT_SUCCESS(status);
4358             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4359                                       //  This UText can be closed immediately, so long as the
4360                                       //  testText string continues to exist.
4361         } else {
4362             bi->setText(testText);
4363         }
4364
4365         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4366             if (i < 0 || i > testText.length()) {
4367                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4368                 break;
4369             }
4370             forwardBreaks[i] = 1;
4371         }
4372
4373         // Find the break positions using reverse iteration
4374         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4375         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4376             if (i < 0 || i > testText.length()) {
4377                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4378                 break;
4379             }
4380             reverseBreaks[i] = 1;
4381         }
4382
4383         // Find the break positions using isBoundary() tests.
4384         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4385         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4386         for (i=0; i<=testText.length(); i++) {
4387             isBoundaryBreaks[i] = bi->isBoundary(i);
4388         }
4389
4390
4391         // Find the break positions using the following() function.
4392         // printf(".");
4393         memset(followingBreaks, 0, sizeof(followingBreaks));
4394         int32_t   lastBreakPos = 0;
4395         followingBreaks[0] = 1;
4396         for (i=0; i<testText.length(); i++) {
4397             breakPos = bi->following(i);
4398             if (breakPos <= i ||
4399                 breakPos < lastBreakPos ||
4400                 breakPos > testText.length() ||
4401                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4402                 errln("%s break monkey test: "
4403                     "Out of range value returned by BreakIterator::following().\n"
4404                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4405                          name, seed, i, breakPos, lastBreakPos);
4406                 break;
4407             }
4408             followingBreaks[breakPos] = 1;
4409             lastBreakPos = breakPos;
4410         }
4411
4412         // Find the break positions using the preceding() function.
4413         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4414         lastBreakPos = testText.length();
4415         precedingBreaks[testText.length()] = 1;
4416         for (i=testText.length(); i>0; i--) {
4417             breakPos = bi->preceding(i);
4418             if (breakPos >= i ||
4419                 breakPos > lastBreakPos ||
4420                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4421                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4422                 errln("%s break monkey test: "
4423                     "Out of range value returned by BreakIterator::preceding().\n"
4424                     "index=%d;  prev returned %d; lastBreak=%d" ,
4425                     name,  i, breakPos, lastBreakPos);
4426                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4427                     precedingBreaks[i] = 2;   // Forces an error.
4428                 }
4429             } else {
4430                 if (breakPos >= 0) {
4431                     precedingBreaks[breakPos] = 1;
4432                 }
4433                 lastBreakPos = breakPos;
4434             }
4435         }
4436
4437         // Compare the expected and actual results.
4438         for (i=0; i<=testText.length(); i++) {
4439             const char *errorType = NULL;
4440             if  (forwardBreaks[i] != expectedBreaks[i]) {
4441                 errorType = "next()";
4442             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4443                 errorType = "previous()";
4444             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4445                 errorType = "isBoundary()";
4446             } else if (followingBreaks[i] != expectedBreaks[i]) {
4447                 errorType = "following()";
4448             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4449                 errorType = "preceding()";
4450             }
4451
4452
4453             if (errorType != NULL) {
4454                 // Format a range of the test text that includes the failure as
4455                 //  a data item that can be included in the rbbi test data file.
4456
4457                 // Start of the range is the last point where expected and actual results
4458                 //   both agreed that there was a break position.
4459                 int startContext = i;
4460                 int32_t count = 0;
4461                 for (;;) {
4462                     if (startContext==0) { break; }
4463                     startContext --;
4464                     if (expectedBreaks[startContext] != 0) {
4465                         if (count == 2) break;
4466                         count ++;
4467                     }
4468                 }
4469
4470                 // End of range is two expected breaks past the start position.
4471                 int endContext = i + 1;
4472                 int ci;
4473                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4474                     for (;;) {
4475                         if (endContext >= testText.length()) {break;}
4476                         if (expectedBreaks[endContext-1] != 0) {
4477                             if (count == 0) break;
4478                             count --;
4479                         }
4480                         endContext ++;
4481                     }
4482                 }
4483
4484                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4485                 UnicodeString errorText = "<data>";
4486                 /***if (strcmp(errorType, "next()") == 0) {
4487                     startContext = 0;
4488                     endContext = testText.length();
4489
4490                     printStringBreaks(testText, expected, expectedCount);
4491                 }***/
4492
4493                 for (ci=startContext; ci<endContext;) {
4494                     UnicodeString hexChars("0123456789abcdef");
4495                     UChar32  c;
4496                     int      bn;
4497                     c = testText.char32At(ci);
4498                     if (ci == i) {
4499                         // This is the location of the error.
4500                         errorText.append("<?>");
4501                     } else if (expectedBreaks[ci] != 0) {
4502                         // This a non-error expected break position.
4503                         errorText.append("\\");
4504                     }
4505                     if (c < 0x10000) {
4506                         errorText.append("\\u");
4507                         for (bn=12; bn>=0; bn-=4) {
4508                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4509                         }
4510                     } else {
4511                         errorText.append("\\U");
4512                         for (bn=28; bn>=0; bn-=4) {
4513                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4514                         }
4515                     }
4516                     ci = testText.moveIndex32(ci, 1);
4517                 }
4518                 errorText.append("\\");
4519                 errorText.append("</data>\n");
4520
4521                 // Output the error
4522                 char  charErrorTxt[500];
4523                 UErrorCode status = U_ZERO_ERROR;
4524                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4525                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4526                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4527
4528                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4529                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4530                     errorType, seed, i, charErrorTxt);
4531                 break;
4532             }
4533         }
4534
4535         loopCount++;
4536     }
4537 #endif
4538 }
4539
4540
4541 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4542 //             This test checks the initial patch,
4543 //             which is to just keep it from crashing.  Correct word boundaries
4544 //             await a proper fix to the dictionary code.
4545 //
4546 void RBBITest::TestBug5532(void)  {
4547    // Text includes a mixture of Thai and Latin.
4548    const unsigned char utf8Data[] = {
4549            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4550            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4551            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4552            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4553            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4554            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4555            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4556            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4557            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4558            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4559            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4560
4561     UErrorCode status = U_ZERO_ERROR;
4562     UText utext=UTEXT_INITIALIZER;
4563     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4564     TEST_ASSERT_SUCCESS(status);
4565
4566     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4567     TEST_ASSERT_SUCCESS(status);
4568     if (U_SUCCESS(status)) {
4569         bi->setText(&utext, status);
4570         TEST_ASSERT_SUCCESS(status);
4571
4572         int32_t breakCount = 0;
4573         int32_t previousBreak = -1;
4574         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4575             // For now, just make sure that the break iterator doesn't hang.
4576             TEST_ASSERT(previousBreak < bi->current());
4577             previousBreak = bi->current();
4578         }
4579         TEST_ASSERT(breakCount > 0);
4580     }
4581     delete bi;
4582     utext_close(&utext);
4583 }
4584
4585
4586 void RBBITest::TestBug9983(void)  {
4587     UnicodeString text = UnicodeString("\\u002A"  // * Other
4588                                        "\\uFF65"  //   Other
4589                                        "\\u309C"  //   Katakana
4590                                        "\\uFF9F"  //   Extend
4591                                        "\\uFF65"  //   Other
4592                                        "\\u0020"  //   Other
4593                                        "\\u0000").unescape();
4594
4595     UErrorCode status = U_ZERO_ERROR;
4596     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4597         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4598     TEST_ASSERT_SUCCESS(status);
4599     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4600         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4601     TEST_ASSERT_SUCCESS(status);
4602     if (U_FAILURE(status)) {
4603         return;
4604     }
4605     int32_t offset, rstatus, iterationCount;
4606
4607     brkiter->setText(text);
4608     brkiter->last();
4609     iterationCount = 0;
4610     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4611         iterationCount++;
4612         rstatus = brkiter->getRuleStatus();
4613         (void)rstatus;     // Suppress set but not used warning.
4614         if (iterationCount >= 10) {
4615            break;
4616         }
4617     }
4618     TEST_ASSERT(iterationCount == 6);
4619
4620     brkiterPOSIX->setText(text);
4621     brkiterPOSIX->last();
4622     iterationCount = 0;
4623     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4624         iterationCount++;
4625         rstatus = brkiterPOSIX->getRuleStatus();
4626         (void)rstatus;     // Suppress set but not used warning.
4627         if (iterationCount >= 10) {
4628            break;
4629         }
4630     }
4631     TEST_ASSERT(iterationCount == 6);
4632 }
4633
4634 // Bug 7547 - verify that building a break iterator from empty rules produces an error.
4635 //
4636 void RBBITest::TestBug7547() {
4637     UnicodeString rules;
4638     UErrorCode status = U_ZERO_ERROR;
4639     UParseError parseError;
4640     RuleBasedBreakIterator breakIterator(rules, parseError, status);
4641     if (status != U_BRK_RULE_SYNTAX) {
4642         errln("%s:%d Expected U_BRK_RULE_SYNTAX, got %s", __FILE__, __LINE__, u_errorName(status));
4643     }
4644     if (parseError.line != 1 || parseError.offset != 0) {
4645         errln("parseError (line, offset) expected (1, 0), got (%d, %d)", parseError.line, parseError.offset);
4646     }
4647 }
4648
4649
4650 void RBBITest::TestBug12797() {
4651     UnicodeString rules = "!!chain; !!forward; $v=b c; a b; $v; !!reverse; .*;";
4652     UErrorCode status = U_ZERO_ERROR;
4653     UParseError parseError;
4654     RuleBasedBreakIterator bi(rules, parseError, status);
4655     if (U_FAILURE(status)) {
4656         errln("%s:%s status = %s", __FILE__, __LINE__, u_errorName(status));
4657         return;
4658     }
4659     UnicodeString text = "abc";
4660     bi.setText(text);
4661     bi.first();
4662     int32_t boundary = bi.next();
4663     if (boundary != 3) {
4664         errln("%s:%d expected boundary==3, got %d", __FILE__, __LINE__, boundary);
4665     }
4666 }
4667
4668 void RBBITest::TestBug12918() {
4669     // This test triggers an assertion failure in dictbe.cpp
4670     const UChar *crasherString = u"\u3325\u4a16";
4671     UErrorCode status = U_ZERO_ERROR;
4672     UBreakIterator* iter = ubrk_open(UBRK_WORD, NULL, crasherString, -1, &status);
4673     if (U_FAILURE(status)) {
4674         dataerrln("%s:%d status = %s", __FILE__, __LINE__, u_errorName(status));
4675         return;
4676     }
4677     ubrk_first(iter);
4678     int32_t pos = 0;
4679     int32_t lastPos = -1;
4680     while((pos = ubrk_next(iter)) != UBRK_DONE) {
4681         if (pos <= lastPos) {
4682             errln("%s:%d (pos, lastPos) = (%d, %d)", __FILE__, __LINE__, pos, lastPos);
4683             break;
4684         }
4685     }
4686     ubrk_close(iter);
4687 }
4688
4689 void RBBITest::TestBug12932() {
4690     // Node Stack overflow in the RBBI rule parser caused a seg fault.
4691     UnicodeString ruleStr(
4692             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4693             "((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((("
4694             "(((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((()))"
4695             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4696             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))"
4697             ")))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))))");
4698
4699     UErrorCode status = U_ZERO_ERROR;
4700     UParseError parseError;
4701     RuleBasedBreakIterator rbbi(ruleStr, parseError, status);
4702     if (status != U_BRK_RULE_SYNTAX) {
4703         errln("%s:%d expected U_BRK_RULE_SYNTAX, got %s",
4704                 __FILE__, __LINE__, u_errorName(status));
4705     }
4706 }
4707
4708
4709 // Emoji Test. Verify that the sequences defined in the Unicode data file emoji-test.txt
4710 //             remain undivided by ICU char, word and line break.
4711 void RBBITest::TestEmoji() {
4712     UErrorCode  status = U_ZERO_ERROR;
4713
4714     CharString testFileName;
4715     testFileName.append(IntlTest::getSourceTestData(status), status);
4716     testFileName.appendPathPart("emoji-test.txt", status);
4717     if (U_FAILURE(status)) {
4718         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4719         return;
4720     }
4721     logln("Opening data file %s\n", testFileName.data());
4722
4723     int    len;
4724     UChar *testFile = ReadAndConvertFile(testFileName.data(), len, "UTF-8", status);
4725     if (U_FAILURE(status) || testFile == NULL) {
4726         errln("%s:%s %s while opening emoji-test.txt", __FILE__, __LINE__, u_errorName(status));
4727         return;
4728     }
4729     UnicodeString testFileAsString(testFile, len);
4730     delete [] testFile;
4731
4732     RegexMatcher lineMatcher(u"^.*?$", testFileAsString, UREGEX_MULTILINE, status);
4733     RegexMatcher hexMatcher(u"\\s*([a-f0-9]*)", UREGEX_CASE_INSENSITIVE, status);
4734     //           hexMatcher group(1) is a hex number, or empty string if no hex number present.
4735     int32_t lineNumber = 0;
4736
4737     LocalPointer<BreakIterator> charBreaks(BreakIterator::createCharacterInstance(Locale::getEnglish(), status), status);
4738     LocalPointer<BreakIterator> wordBreaks(BreakIterator::createWordInstance(Locale::getEnglish(), status), status);
4739     LocalPointer<BreakIterator> lineBreaks(BreakIterator::createLineInstance(Locale::getEnglish(), status), status);
4740     if (U_FAILURE(status)) {
4741         dataerrln("%s:%d %s while opening break iterators", __FILE__, __LINE__, u_errorName(status));
4742         return;
4743     }
4744
4745     while (lineMatcher.find()) {
4746         ++lineNumber;
4747         UnicodeString line = lineMatcher.group(status);
4748         hexMatcher.reset(line);
4749         UnicodeString testString;   // accumulates the emoji sequence.
4750         while (hexMatcher.find() && hexMatcher.group(1, status).length() > 0) {
4751             UnicodeString hex = hexMatcher.group(1, status);
4752             if (hex.length() > 8) {
4753                 errln("%s:%d emoji-test.txt:%d invalid code point %s", __FILE__, __LINE__, lineNumber, CStr(hex)());
4754                 break;
4755             }
4756             CharString hex8;
4757             hex8.appendInvariantChars(hex, status);
4758             UChar32 c = (UChar32)strtol(hex8.data(), NULL, 16);
4759             if (c<=0x10ffff) {
4760                 testString.append(c);
4761             } else {
4762                 errln("%s:%d emoji-test.txt:%d Error: Unicode Character %s value out of range.",
4763                         __FILE__, __LINE__, lineNumber, hex8.data());
4764                 break;
4765             }
4766         }
4767
4768         if (testString.length() > 1) {
4769             charBreaks->setText(testString);
4770             charBreaks->first();
4771             int32_t firstBreak = charBreaks->next();
4772             if (testString.length() != firstBreak) {
4773                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4774                         __FILE__, __LINE__, lineNumber, firstBreak);
4775             }
4776             wordBreaks->setText(testString);
4777             wordBreaks->first();
4778             firstBreak = wordBreaks->next();
4779             if (testString.length() != firstBreak) {
4780                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4781                         __FILE__, __LINE__, lineNumber, firstBreak);
4782             }
4783             lineBreaks->setText(testString);
4784             lineBreaks->first();
4785             firstBreak = lineBreaks->next();
4786             if (testString.length() != firstBreak) {
4787                 errln("%s:%d  emoji-test.txt:%d Error, uexpected break at offset %d",
4788                         __FILE__, __LINE__, lineNumber, firstBreak);
4789             }
4790         }
4791     }
4792 }
4793
4794
4795 //
4796 //  TestDebug    -  A place-holder test for debugging purposes.
4797 //                  For putting in fragments of other tests that can be invoked
4798 //                  for tracing  without a lot of unwanted extra stuff happening.
4799 //
4800 void RBBITest::TestDebug(void) {
4801 }
4802
4803 void RBBITest::TestProperties() {
4804     UErrorCode errorCode = U_ZERO_ERROR;
4805     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4806     if (!prependSet.isEmpty()) {
4807         errln(
4808             "[:GCB=Prepend:] is not empty any more. "
4809             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4810             "change this test to the opposite condition.");
4811     }
4812 }
4813
4814 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */