icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2016, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "unicode/utypes.h"
  13 #if !UCONFIG_NO_BREAK_ITERATION
  14
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17 #include <string.h>
  18
  19 #include "unicode/brkiter.h"
  20 #include "unicode/localpointer.h"
  21 #include "unicode/numfmt.h"
  22 #include "unicode/rbbi.h"
  23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  24 #include "unicode/regex.h"
  25 #endif
  26 #include "unicode/schriter.h"
  27 #include "unicode/uchar.h"
  28 #include "unicode/utf16.h"
  29 #include "unicode/ucnv.h"
  30 #include "unicode/uniset.h"
  31 #include "unicode/uscript.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/utext.h"
  34
  35 #include "charstr.h"
  36 #include "cmemory.h"
  37 #include "intltest.h"
  38 #include "rbbitst.h"
  39 #include "utypeinfo.h"  // for 'typeid' to work
  40 #include "uvector.h"
  41 #include "uvectr32.h"
  42
  43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  44 #include "unicode/filteredbrk.h"
  45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  46
// TEST_ASSERT(x): reports a failure, tagged with the current file and line,
//   when the expression x evaluates to false.
//   The expansion is a braced block that calls errln(), so errln must be
//   callable at the point of use.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode): reports a failure, tagged with the current file,
//   line and the error's name, when errcode satisfies U_FAILURE().
//   The report goes through errcheckln(), which is also handed the error code.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  52
  53
  54 //---------------------------------------------
  55 // runIndexedTest
  56 //---------------------------------------------
  57
  58
  59 //  Note:  Before adding new tests to this file, check whether the desired test data can
  60 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
  61 //         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
  63 //         will run there as well, without additional effort.
  64
  65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  66 {
  67     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  68
  69     switch (index) {
  70 #if !UCONFIG_NO_FILE_IO
  71         case 0: name = "TestBug4153072";
  72             if(exec) TestBug4153072();                         break;
  73 #else
  74         case 0: name = "skip";
  75             break;
  76 #endif
  77
  78         case 1: name = "skip";
  79             break;
  80         case 2: name = "TestStatusReturn";
  81             if(exec) TestStatusReturn();                       break;
  82
  83 #if !UCONFIG_NO_FILE_IO
  84         case 3: name = "TestUnicodeFiles";
  85             if(exec) TestUnicodeFiles();                       break;
  86         case 4: name = "TestEmptyString";
  87             if(exec) TestEmptyString();                        break;
  88 #else
  89         case 3: case 4: name = "skip";
  90             break;
  91 #endif
  92
  93         case 5: name = "TestGetAvailableLocales";
  94             if(exec) TestGetAvailableLocales();                break;
  95
  96         case 6: name = "TestGetDisplayName";
  97             if(exec) TestGetDisplayName();                     break;
  98
  99 #if !UCONFIG_NO_FILE_IO
 100         case 7: name = "TestEndBehaviour";
 101             if(exec) TestEndBehaviour();                       break;
 102         case 8: case 9: case 10: name = "skip";
 103              break;
 104         case 11: name = "TestWordBreaks";
 105              if(exec) TestWordBreaks();                        break;
 106         case 12: name = "TestWordBoundary";
 107              if(exec) TestWordBoundary();                      break;
 108         case 13: name = "TestLineBreaks";
 109              if(exec) TestLineBreaks();                        break;
 110         case 14: name = "TestSentBreaks";
 111              if(exec) TestSentBreaks();                        break;
 112         case 15: name = "TestExtended";
 113              if(exec) TestExtended();                          break;
 114 #else
 115         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 116              break;
 117 #endif
 118
 119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 120         case 16:
 121             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
 122 #else
 123         case 16:
 124              name = "skip";                                    break;
 125 #endif
 126
 127 #if !UCONFIG_NO_FILE_IO
 128         case 17: name = "TestBug3818";
 129             if(exec) TestBug3818();                            break;
 130 #else
 131         case 17: name = "skip";
 132             break;
 133 #endif
 134
 135         case 18: name = "skip";
 136             break;
 137         case 19: name = "TestDebug";
 138             if(exec) TestDebug();                              break;
 139         case 20: name = "skip";
 140             break;
 141
 142 #if !UCONFIG_NO_FILE_IO
 143         case 21: name = "TestBug5775";
 144             if (exec) TestBug5775();                           break;
 145 #else
 146         case 21: name = "skip";
 147             break;
 148 #endif
 149
 150         case 22: name = "TestBug9983";
 151             if (exec) TestBug9983();                           break;
 152         case 23: name = "TestDictRules";
 153             if (exec) TestDictRules();                         break;
 154         case 24: name = "TestBug5532";
 155             if (exec) TestBug5532();                           break;
 156         default: name = ""; break; //needed to end loop
 157     }
 158 }
 159
 160
 161 //---------------------------------------------------------------------------
 162 //
 163 //   class BITestData   Holds a set of Break iterator test data and results
 164 //                      Includes
 165 //                         - the string data to be broken
 166 //                         - a vector of the expected break positions.
 167 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
 169 //                         - The expected break tag values.
 170 //                         - Vectors of actual break positions and tag values.
 171 //                         - Functions for comparing actual with expected and
 172 //                            reporting errors.
 173 //
 174 //----------------------------------------------------------------------------
 175 class BITestData {
 176 public:
 177     UnicodeString    fDataToBreak;
 178     UVector          fExpectedBreakPositions;
 179     UVector          fExpectedTags;
 180     UVector          fLineNum;
 181     UVector          fActualBreakPositions;   // Test Results.
 182     UVector          fActualTags;
 183
 184     BITestData(UErrorCode &status);
 185     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 186     void             checkResults(const char *heading, RBBITest *test);
 187     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 188     void             clearResults();
 189 };
 190
 191 //
 192 // Constructor.
 193 //
 194 BITestData::BITestData(UErrorCode &status)
 195 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 196   fActualTags(status)
 197 {
 198 }
 199
 200 //
// addDataChunk.   Add a section (non-breaking) piece of data to the test data.
 202 //                 The macro form collects the line number, which is helpful
 203 //                 when tracking down failures.
 204 //
 205 //                 A null data item is inserted at the start of each test's data
 206 //                  to put the starting zero into the data list.  The position saved for
 207 //                  each non-null item is its ending position.
 208 //
 209 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 211     if (U_FAILURE(status)) {return;}
 212     if (data != NULL) {
 213         fDataToBreak.append(CharsToUnicodeString(data));
 214     }
 215     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 216     fExpectedTags.addElement(tag, status);
 217     fLineNum.addElement(lineNum, status);
 218 }
 219
 220
 221 //
 222 //  checkResults.   Compare the actual and expected break positions, report any differences.
 223 //
 224 void BITestData::checkResults(const char *heading, RBBITest *test) {
 225     int32_t   expectedIndex = 0;
 226     int32_t   actualIndex = 0;
 227
 228     for (;;) {
 229         // If we've run through both the expected and actual results vectors, we're done.
 230         //   break out of the loop.
 231         if (expectedIndex >= fExpectedBreakPositions.size() &&
 232             actualIndex   >= fActualBreakPositions.size()) {
 233             break;
 234         }
 235
 236
 237         if (expectedIndex >= fExpectedBreakPositions.size()) {
 238             err(heading, test, expectedIndex-1, actualIndex);
 239             actualIndex++;
 240             continue;
 241         }
 242
 243         if (actualIndex >= fActualBreakPositions.size()) {
 244             err(heading, test, expectedIndex, actualIndex-1);
 245             expectedIndex++;
 246             continue;
 247         }
 248
 249         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 250             err(heading, test, expectedIndex, actualIndex);
 251             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 252             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 253                 actualIndex++;
 254             } else {
 255                 expectedIndex++;
 256             }
 257             continue;
 258         }
 259
 260         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 261             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 262                 heading, fLineNum.elementAt(expectedIndex),
 263                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 264         }
 265
 266         actualIndex++;
 267         expectedIndex++;
 268     }
 269 }
 270
 271 //
 272 //  err   -  An error was found.  Report it, along with information about where the
 273 //                                incorrectly broken test data appeared in the source file.
 274 //
 275 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 276 {
 277     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 278     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 279     int32_t   o        = 0;
 280     int32_t   line     = fLineNum.elementAti(expectedIdx);
 281     if (expectedIdx > 0) {
 282         // The line numbers are off by one because a premature break occurs somewhere
 283         //    within the previous item, rather than at the start of the current (expected) item.
 284         //    We want to report the offset of the unexpected break from the start of
 285         //      this previous item.
 286         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 287     }
 288     if (actual < expected) {
 289         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 290     } else {
 291         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 292     }
 293 }
 294
 295
 296 void BITestData::clearResults() {
 297     fActualBreakPositions.removeAllElements();
 298     fActualTags.removeAllElements();
 299 }
 300
 301
 302 //--------------------------------------------------------------------------------------
 303 //
 304 //    RBBITest    constructor and destructor
 305 //
 306 //--------------------------------------------------------------------------------------
 307
 308 RBBITest::RBBITest() {
 309 }
 310
 311
 312 RBBITest::~RBBITest() {
 313 }
 314
 315 //-----------------------------------------------------------------------------------
 316 //
 317 //   Test for status {tag} return value from break rules.
 318 //        TODO:  a more thorough test.
 319 //
 320 //-----------------------------------------------------------------------------------
 321 void RBBITest::TestStatusReturn() {
 322      UnicodeString rulesString1("$Letters = [:L:];\n"
 323                                   "$Numbers = [:N:];\n"
 324                                   "$Letters+{1};\n"
 325                                   "$Numbers+{2};\n"
 326                                   "Help\\ /me\\!{4};\n"
 327                                   "[^$Letters $Numbers];\n"
 328                                   "!.*;\n", -1, US_INV);
 329      UnicodeString testString1  = "abc123..abc Help me Help me!";
 330                                 // 01234567890123456789012345678
 331      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 332      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 333
 334      UErrorCode status=U_ZERO_ERROR;
 335      UParseError    parseError;
 336
 337      LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
 338      if(U_FAILURE(status)) {
 339          dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
 340          return;
 341      }
 342      int32_t  pos;
 343      int32_t  i = 0;
 344      bi->setText(testString1);
 345      for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 346          if (pos != bounds1[i]) {
 347              errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
 348              break;
 349          }
 350
 351          int tag = bi->getRuleStatus();
 352          if (tag != brkStatus[i]) {
 353              errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
 354              break;
 355          }
 356          i++;
 357      }
 358 }
 359
 360
 361 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 362     UErrorCode status = U_ZERO_ERROR;
 363     char name[100];
 364     printf("code    alpha extend alphanum type word sent line name\n");
 365     int nextExpectedIndex = 0;
 366     utext_setNativeIndex(tstr, 0);
 367     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
 368         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 369             printf("------------------------------------------------ %d\n", j);
 370             ++nextExpectedIndex;
 371         }
 372
 373         UChar32 c = utext_next32(tstr);
 374         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 375         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 376                            u_isUAlphabetic(c),
 377                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 378                            u_isalnum(c),
 379                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 380                                                   u_charType(c),
 381                                                   U_SHORT_PROPERTY_NAME),
 382                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 383                                                   u_getIntPropertyValue(c,
 384                                                           UCHAR_WORD_BREAK),
 385                                                   U_SHORT_PROPERTY_NAME),
 386                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 387                                    u_getIntPropertyValue(c,
 388                                            UCHAR_SENTENCE_BREAK),
 389                                    U_SHORT_PROPERTY_NAME),
 390                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 391                                    u_getIntPropertyValue(c,
 392                                            UCHAR_LINE_BREAK),
 393                                    U_SHORT_PROPERTY_NAME),
 394                            name);
 395     }
 396 }
 397
 398
 399 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 400    UErrorCode status = U_ZERO_ERROR;
 401    UText *tstr = NULL;
 402    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 403    if (U_FAILURE(status)) {
 404        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 405        return;
 406     }
 407    printStringBreaks(tstr, expected, expectedCount);
 408    utext_close(tstr);
 409 }
 410
 411
 412 void RBBITest::TestBug3818() {
 413     UErrorCode  status = U_ZERO_ERROR;
 414
 415     // Four Thai words...
 416     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 417                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 418     UnicodeString  thaiStr(thaiWordData);
 419
 420     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 421     if (U_FAILURE(status) || bi == NULL) {
 422         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 423         return;
 424     }
 425     bi->setText(thaiStr);
 426
 427     int32_t  startOfSecondWord = bi->following(1);
 428     if (startOfSecondWord != 4) {
 429         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 430             __FILE__, __LINE__, startOfSecondWord);
 431     }
 432     startOfSecondWord = bi->following(0);
 433     if (startOfSecondWord != 4) {
 434         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 435             __FILE__, __LINE__, startOfSecondWord);
 436     }
 437     delete bi;
 438 }
 439
 440 //----------------------------------------------------------------------------
 441 //
 442 // generalIteratorTest      Given a break iterator and a set of test data,
 443 //                          Run the tests and report the results.
 444 //
 445 //----------------------------------------------------------------------------
 446 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 447 {
 448
 449     bi.setText(td.fDataToBreak);
 450
 451     testFirstAndNext(bi, td);
 452
 453     testLastAndPrevious(bi, td);
 454
 455     testFollowing(bi, td);
 456     testPreceding(bi, td);
 457     testIsBoundary(bi, td);
 458     doMultipleSelectionTest(bi, td);
 459 }
 460
 461
 462 //
 463 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 464 //                       kind of loop.
 465 //
 466 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 467 {
 468     UErrorCode  status = U_ZERO_ERROR;
 469     int32_t     p;
 470     int32_t     lastP = -1;
 471     int32_t     tag;
 472
 473     logln("Test first and next");
 474     bi.setText(td.fDataToBreak);
 475     td.clearResults();
 476
 477     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 478         td.fActualBreakPositions.addElement(p, status);  // Save result.
 479         tag = bi.getRuleStatus();
 480         td.fActualTags.addElement(tag, status);
 481         if (p <= lastP) {
 482             // If the iterator is not making forward progress, stop.
 483             //  No need to raise an error here, it'll be detected in the normal check of results.
 484             break;
 485         }
 486         lastP = p;
 487     }
 488     td.checkResults("testFirstAndNext", this);
 489 }
 490
 491
 492 //
 493 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 494 //
 495 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 496 {
 497     UErrorCode  status = U_ZERO_ERROR;
 498     int32_t     p;
 499     int32_t     lastP  = 0x7ffffffe;
 500     int32_t     tag;
 501
 502     logln("Test last and previous");
 503     bi.setText(td.fDataToBreak);
 504     td.clearResults();
 505
 506     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 507         // Save break position.  Insert it at start of vector of results, shoving
 508         //    already-saved results further towards the end.
 509         td.fActualBreakPositions.insertElementAt(p, 0, status);
 510         // bi.previous();   // TODO:  Why does this fix things up????
 511         // bi.next();
 512         tag = bi.getRuleStatus();
 513         td.fActualTags.insertElementAt(tag, 0, status);
 514         if (p >= lastP) {
 515             // If the iterator is not making progress, stop.
 516             //  No need to raise an error here, it'll be detected in the normal check of results.
 517             break;
 518         }
 519         lastP = p;
 520     }
 521     td.checkResults("testLastAndPrevious", this);
 522 }
 523
 524
 525 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 526 {
 527     UErrorCode  status = U_ZERO_ERROR;
 528     int32_t     p;
 529     int32_t     tag;
 530     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 531                                  //   cannot be -1; that is returned for DONE.
 532     int         i;
 533
 534     logln("testFollowing():");
 535     bi.setText(td.fDataToBreak);
 536     td.clearResults();
 537
 538     // Save the starting point, since we won't get that out of following.
 539     p = bi.first();
 540     td.fActualBreakPositions.addElement(p, status);  // Save result.
 541     tag = bi.getRuleStatus();
 542     td.fActualTags.addElement(tag, status);
 543
 544     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 545         p = bi.following(i);
 546         if (p != lastP) {
 547             if (p == RuleBasedBreakIterator::DONE) {
 548                 break;
 549             }
 550             // We've reached a new break position.  Save it.
 551             td.fActualBreakPositions.addElement(p, status);  // Save result.
 552             tag = bi.getRuleStatus();
 553             td.fActualTags.addElement(tag, status);
 554             lastP = p;
 555         }
 556     }
 557     // The loop normally exits by means of the break in the middle.
 558     // Make sure that the index was at the correct position for the break iterator to have
 559     //   returned DONE.
 560     if (i != td.fDataToBreak.length()) {
 561         errln("testFollowing():  iterator returned DONE prematurely.");
 562     }
 563
 564     // Full check of all results.
 565     td.checkResults("testFollowing", this);
 566 }
 567
 568
 569
 570 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 571     UErrorCode  status = U_ZERO_ERROR;
 572     int32_t     p;
 573     int32_t     tag;
 574     int32_t     lastP  = 0x7ffffffe;
 575     int         i;
 576
 577     logln("testPreceding():");
 578     bi.setText(td.fDataToBreak);
 579     td.clearResults();
 580
 581     p = bi.last();
 582     td.fActualBreakPositions.addElement(p, status);
 583     tag = bi.getRuleStatus();
 584     td.fActualTags.addElement(tag, status);
 585
 586     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 587         p = bi.preceding(i);
 588         if (p != lastP) {
 589             if (p == RuleBasedBreakIterator::DONE) {
 590                 break;
 591             }
 592             // We've reached a new break position.  Save it.
 593             td.fActualBreakPositions.insertElementAt(p, 0, status);
 594             lastP = p;
 595             tag = bi.getRuleStatus();
 596             td.fActualTags.insertElementAt(tag, 0, status);
 597         }
 598     }
 599     // The loop normally exits by means of the break in the middle.
 600     // Make sure that the index was at the correct position for the break iterator to have
 601     //   returned DONE.
 602     if (i != 0) {
 603         errln("testPreceding():  iterator returned DONE prematurely.");
 604     }
 605
 606     // Full check of all results.
 607     td.checkResults("testPreceding", this);
 608 }
 609
 610
 611
 612 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 613     UErrorCode  status = U_ZERO_ERROR;
 614     int         i;
 615     int32_t     tag;
 616
 617     logln("testIsBoundary():");
 618     bi.setText(td.fDataToBreak);
 619     td.clearResults();
 620
 621     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 622         if (bi.isBoundary(i)) {
 623             td.fActualBreakPositions.addElement(i, status);  // Save result.
 624             tag = bi.getRuleStatus();
 625             td.fActualTags.addElement(tag, status);
 626         }
 627     }
 628     td.checkResults("testIsBoundary: ", this);
 629 }
 630
 631
 632
 633 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 634 {
 635     iterator.setText(td.fDataToBreak);
 636
 637     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 638     int32_t offset = iterator.first();
 639     int32_t testOffset;
 640     int32_t count = 0;
 641
 642     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 643
 644     if (*testIterator != iterator)
 645         errln("clone() or operator!= failed: two clones compared unequal");
 646
 647     do {
 648         testOffset = testIterator->first();
 649         testOffset = testIterator->next(count);
 650         if (offset != testOffset)
 651             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 652
 653         if (offset != RuleBasedBreakIterator::DONE) {
 654             count++;
 655             offset = iterator.next();
 656
 657             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 658                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 659                 if (count > 10000 || offset == -1) {
 660                     errln("operator== failed too many times. Stopping test.");
 661                     if (offset == -1) {
 662                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 663                     }
 664                     return;
 665                 }
 666             }
 667         }
 668     } while (offset != RuleBasedBreakIterator::DONE);
 669
 670     // now do it backwards...
 671     offset = iterator.last();
 672     count = 0;
 673
 674     do {
 675         testOffset = testIterator->last();
 676         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 677         if (offset != testOffset)
 678             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 679
 680         if (offset != RuleBasedBreakIterator::DONE) {
 681             count--;
 682             offset = iterator.previous();
 683         }
 684     } while (offset != RuleBasedBreakIterator::DONE);
 685
 686     delete testIterator;
 687 }
 688
 689
 690 //---------------------------------------------
 691 //
 692 //     other tests
 693 //
 694 //---------------------------------------------
 695 void RBBITest::TestEmptyString()
 696 {
 697     UnicodeString text = "";
 698     UErrorCode status = U_ZERO_ERROR;
 699
 700     BITestData x(status);
 701     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 702     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 703     if (U_FAILURE(status))
 704     {
 705         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 706         return;
 707     }
 708     generalIteratorTest(*bi, x);
 709     delete bi;
 710 }
 711
 712 void RBBITest::TestGetAvailableLocales()
 713 {
 714     int32_t locCount = 0;
 715     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 716
 717     if (locCount == 0)
 718         dataerrln("getAvailableLocales() returned an empty list!");
 719     // Just make sure that it's returning good memory.
 720     int32_t i;
 721     for (i = 0; i < locCount; ++i) {
 722         logln(locList[i].getName());
 723     }
 724 }
 725
 726 //Testing the BreakIterator::getDisplayName() function
 727 void RBBITest::TestGetDisplayName()
 728 {
 729     UnicodeString   result;
 730
 731     BreakIterator::getDisplayName(Locale::getUS(), result);
 732     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 733         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 734                 + result);
 735
 736     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 737     if (result != "French (France)")
 738         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 739                 + result);
 740 }
 741 /**
 742  * Test End Behaviour
 743  * @bug 4068137
 744  */
 745 void RBBITest::TestEndBehaviour()
 746 {
 747     UErrorCode status = U_ZERO_ERROR;
 748     UnicodeString testString("boo.");
 749     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 750     if (U_FAILURE(status))
 751     {
 752         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 753         return;
 754     }
 755     wb->setText(testString);
 756
 757     if (wb->first() != 0)
 758         errln("Didn't get break at beginning of string.");
 759     if (wb->next() != 3)
 760         errln("Didn't get break before period in \"boo.\"");
 761     if (wb->current() != 4 && wb->next() != 4)
 762         errln("Didn't get break at end of string.");
 763     delete wb;
 764 }
 765 /*
 766  * @bug 4153072
 767  */
 768 void RBBITest::TestBug4153072() {
 769     UErrorCode status = U_ZERO_ERROR;
 770     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 771     if (U_FAILURE(status))
 772     {
 773         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 774         return;
 775     }
 776     UnicodeString str("...Hello, World!...");
 777     int32_t begin = 3;
 778     int32_t end = str.length() - 3;
 779     UBool onBoundary;
 780
 781     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 782     iter->adoptText(textIterator);
 783     int index;
 784     // Note: with the switch to UText, there is no way to restrict the
 785     //       iteration range to begin at an index other than zero.
 786     //       String character iterators created with a non-zero bound are
 787     //         treated by RBBI as being empty.
 788     for (index = -1; index < begin + 1; ++index) {
 789         onBoundary = iter->isBoundary(index);
 790         if (index == 0?  !onBoundary : onBoundary) {
 791             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 792                             " and begin index = " + begin);
 793         }
 794     }
 795     delete iter;
 796 }
 797
 798
 799 //
 800 // Test for problem reported by Ashok Matoria on 9 July 2007
 801 //    One.<kSoftHyphen><kSpace>Two.
 802 //
 803 //    Sentence break at start (0) and then on calling next() it breaks at
 804 //   'T' of "Two". Now, at this point if I do next() and
 805 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 806 //
 807 void RBBITest::TestBug5775() {
 808     UErrorCode status = U_ZERO_ERROR;
 809     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 810     TEST_ASSERT_SUCCESS(status);
 811     if (U_FAILURE(status)) {
 812         return;
 813     }
 814 // Check for status first for better handling of no data errors.
 815     TEST_ASSERT(bi != NULL);
 816     if (bi == NULL) {
 817         return;
 818     }
 819
 820     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 821     //               01234      56789
 822     s = s.unescape();
 823     bi->setText(s);
 824     int pos = bi->next();
 825     TEST_ASSERT(pos == 6);
 826     pos = bi->next();
 827     TEST_ASSERT(pos == 10);
 828     pos = bi->previous();
 829     TEST_ASSERT(pos == 6);
 830     delete bi;
 831 }
 832
 833
 834
 835 //------------------------------------------------------------------------------
 836 //
//   RBBITest::TestExtended    Run RBBI tests from an external test data file
 838 //
 839 //------------------------------------------------------------------------------
 840
 841 struct TestParams {
 842     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 843                                            //   Changed out whenever test data changes break type.
 844
 845     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 846     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 847     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 848     UVector32       *srcCol;
 849
 850     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 851     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 852     CharString       utf8String;           // UTF-8 form of text to break.
 853
 854     TestParams(UErrorCode &status) : dataToBreak() {
 855         bi               = NULL;
 856         expectedBreaks   = new UVector32(status);
 857         srcLine          = new UVector32(status);
 858         srcCol           = new UVector32(status);
 859         textToBreak      = NULL;
 860         textMap          = new UVector32(status);
 861     }
 862
 863     ~TestParams() {
 864         delete bi;
 865         delete expectedBreaks;
 866         delete srcLine;
 867         delete srcCol;
 868         utext_close(textToBreak);
 869         delete textMap;
 870     }
 871
 872     int32_t getSrcLine(int32_t bp);
 873     int32_t getExpectedBreak(int32_t bp);
 874     int32_t getSrcCol(int32_t bp);
 875
 876     void setUTF16(UErrorCode &status);
 877     void setUTF8(UErrorCode &status);
 878 };
 879
 880 // Append a UnicodeString to a CharString with UTF-8 encoding.
 881 // Substitute any invalid chars.
 882 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 883 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 884     if (U_FAILURE(status)) {
 885         return;
 886     }
 887     int32_t utf8Length;
 888     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 889                        src.getBuffer(), src.length(),   // UTF-16 data
 890                        0xfffd, NULL,                    // Substitution char, number of subs.
 891                        &status);
 892     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 893         return;
 894     }
 895     status = U_ZERO_ERROR;
 896     int32_t capacity;
 897     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 898     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 899                        src.getBuffer(), src.length(),
 900                        0xfffd, NULL, &status);
 901     dest.append(buffer, utf8Length, status);
 902 }
 903
 904
 905 void TestParams::setUTF16(UErrorCode &status) {
 906     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 907     textMap->removeAllElements();
 908     for (int32_t i=0; i<dataToBreak.length(); i++) {
 909         if (i == dataToBreak.getChar32Start(i)) {
 910             textMap->addElement(i, status);
 911         } else {
 912             textMap->addElement(-1, status);
 913         }
 914     }
 915     textMap->addElement(dataToBreak.length(), status);
 916     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 917 }
 918
 919
 920 void TestParams::setUTF8(UErrorCode &status) {
 921     if (U_FAILURE(status)) {
 922         return;
 923     }
 924     utf8String.clear();
 925     CharStringAppend(utf8String, dataToBreak, status);
 926     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 927     if (U_FAILURE(status)) {
 928         return;
 929     }
 930
 931     textMap->removeAllElements();
 932     int32_t utf16Index = 0;
 933     for (;;) {
 934         textMap->addElement(utf16Index, status);
 935         UChar32 c32 = utext_current32(textToBreak);
 936         if (c32 < 0) {
 937             break;
 938         }
 939         utf16Index += U16_LENGTH(c32);
 940         utext_next32(textToBreak);
 941         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 942             textMap->addElement(-1, status);
 943         }
 944     }
 945     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 946 }
 947
 948
 949 int32_t TestParams::getSrcLine(int bp) {
 950     if (bp >= textMap->size()) {
 951         bp = textMap->size() - 1;
 952     }
 953     int32_t i = 0;
 954     for(; bp >= 0 ; --bp) {
 955         // Move to a character boundary if we are not on one already.
 956         i = textMap->elementAti(bp);
 957         if (i >= 0) {
 958             break;
 959         }
 960     }
 961     return srcLine->elementAti(i);
 962 }
 963
 964
 965 int32_t TestParams::getExpectedBreak(int bp) {
 966     if (bp >= textMap->size()) {
 967         return 0;
 968     }
 969     int32_t i = textMap->elementAti(bp);
 970     int32_t retVal = 0;
 971     if (i >= 0) {
 972         retVal = expectedBreaks->elementAti(i);
 973     }
 974     return retVal;
 975 }
 976
 977
 978 int32_t TestParams::getSrcCol(int bp) {
 979     if (bp >= textMap->size()) {
 980         bp = textMap->size() - 1;
 981     }
 982     int32_t i = 0;
 983     for(; bp >= 0; --bp) {
 984         // Move bp to a character boundary if we are not on one already.
 985         i = textMap->elementAti(bp);
 986         if (i >= 0) {
 987             break;
 988         }
 989     }
 990     return srcCol->elementAti(i);
 991 }
 992
 993
 994 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 995     int32_t    bp;
 996     int32_t    prevBP;
 997     int32_t    i;
 998
 999     TEST_ASSERT_SUCCESS(status);
1000     if (U_FAILURE(status)) {
1001         return;
1002     }
1003
1004     if (t->bi == NULL) {
1005         return;
1006     }
1007
1008     t->bi->setText(t->textToBreak, status);
1009     //
1010     //  Run the iterator forward
1011     //
1012     prevBP = -1;
1013     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1014         if (prevBP ==  bp) {
1015             // Fail for lack of forward progress.
1016             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1017                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1018             break;
1019         }
1020
1021         // Check that there we didn't miss an expected break between the last one
1022         //  and this one.
1023         for (i=prevBP+1; i<bp; i++) {
1024             if (t->getExpectedBreak(i) != 0) {
1025                 int expected[] = {0, i};
1026                 printStringBreaks(t->dataToBreak, expected, 2);
1027                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1028                       i, t->getSrcLine(i), t->getSrcCol(i));
1029             }
1030         }
1031
1032         // Check that the break we did find was expected
1033         if (t->getExpectedBreak(bp) == 0) {
1034             int expected[] = {0, bp};
1035             printStringBreaks(t->textToBreak, expected, 2);
1036             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1037                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1038         } else {
1039             // The break was expected.
1040             //   Check that the {nnn} tag value is correct.
1041             int32_t expectedTagVal = t->getExpectedBreak(bp);
1042             if (expectedTagVal == -1) {
1043                 expectedTagVal = 0;
1044             }
1045             int32_t line = t->getSrcLine(bp);
1046             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1047             if (rs != expectedTagVal) {
1048                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1049                       "          Actual, Expected status = %4d, %4d",
1050                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1051             }
1052         }
1053
1054         prevBP = bp;
1055     }
1056
1057     // Verify that there were no missed expected breaks after the last one found
1058     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1059         if (t->getExpectedBreak(i) != 0) {
1060             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1061                       i, t->getSrcLine(i), t->getSrcCol(i));
1062         }
1063     }
1064
1065     //
1066     //  Run the iterator backwards, verify that the same breaks are found.
1067     //
1068     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1069     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1070         if (prevBP ==  bp) {
1071             // Fail for lack of progress.
1072             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1073                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1074             break;
1075         }
1076
1077         // Check that we didn't miss an expected break between the last one
1078         //  and this one.  (UVector returns zeros for index out of bounds.)
1079         for (i=prevBP-1; i>bp; i--) {
1080             if (t->getExpectedBreak(i) != 0) {
1081                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1082                       i, t->getSrcLine(i), t->getSrcCol(i));
1083             }
1084         }
1085
1086         // Check that the break we did find was expected
1087         if (t->getExpectedBreak(bp) == 0) {
1088             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1089                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1090         } else {
1091             // The break was expected.
1092             //   Check that the {nnn} tag value is correct.
1093             int32_t expectedTagVal = t->getExpectedBreak(bp);
1094             if (expectedTagVal == -1) {
1095                 expectedTagVal = 0;
1096             }
1097             int line = t->getSrcLine(bp);
1098             int32_t rs = t->bi->getRuleStatus();
1099             if (rs != expectedTagVal) {
1100                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1101                       "          Actual, Expected status = %4d, %4d",
1102                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1103             }
1104         }
1105
1106         prevBP = bp;
1107     }
1108
1109     // Verify that there were no missed breaks prior to the last one found
1110     for (i=prevBP-1; i>=0; i--) {
1111         if (t->getExpectedBreak(i) != 0) {
1112             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1113                       i, t->getSrcLine(i), t->getSrcCol(i));
1114         }
1115     }
1116
1117     // Check isBoundary()
1118     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1119         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1120         UBool boundaryFound    = t->bi->isBoundary(i);
1121         if (boundaryExpected != boundaryFound) {
1122             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1123                   "        Expected, Actual= %s, %s",
1124                   i, t->getSrcLine(i), t->getSrcCol(i),
1125                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1126         }
1127     }
1128
1129     // Check following()
1130     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1131         int32_t actualBreak = t->bi->following(i);
1132         int32_t expectedBreak = BreakIterator::DONE;
1133         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1134             if (t->getExpectedBreak(j) != 0) {
1135                 expectedBreak = j;
1136                 break;
1137             }
1138         }
1139         if (expectedBreak != actualBreak) {
1140             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1141                   "        Expected, Actual= %d, %d",
1142                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1143         }
1144     }
1145
1146     // Check preceding()
1147     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1148         int32_t actualBreak = t->bi->preceding(i);
1149         int32_t expectedBreak = BreakIterator::DONE;
1150
1151         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1152         // preceding(trailing byte) will return the index of some preceding code point,
1153         // not the lead byte of the current code point, even though that has a smaller index.
1154         // Therefore, start looking at the expected break data not at i-1, but at
1155         // the start of code point index - 1.
1156         utext_setNativeIndex(t->textToBreak, i);
1157         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1158         for (; j >= 0; j--) {
1159             if (t->getExpectedBreak(j) != 0) {
1160                 expectedBreak = j;
1161                 break;
1162             }
1163         }
1164         if (expectedBreak != actualBreak) {
1165             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1166                   "        Expected, Actual= %d, %d",
1167                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1168         }
1169     }
1170 }
1171
1172
1173 void RBBITest::TestExtended() {
1174 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1175     UErrorCode      status  = U_ZERO_ERROR;
1176     Locale          locale("");
1177
1178     UnicodeString       rules;
1179     TestParams          tp(status);
1180
1181     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1182     if (U_FAILURE(status)) {
1183         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1184     }
1185
1186
1187     //
1188     //  Open and read the test data file.
1189     //
1190     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1191     char testFileName[1000];
1192     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1193         errln("Can't open test data.  Path too long.");
1194         return;
1195     }
1196     strcpy(testFileName, testDataDirectory);
1197     strcat(testFileName, "rbbitst.txt");
1198
1199     int    len;
1200     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1201     if (U_FAILURE(status)) {
1202         return; /* something went wrong, error already output */
1203     }
1204
1205
1206     bool skipTest = false; // Skip this test?
1207
1208     //
1209     //  Put the test data into a UnicodeString
1210     //
1211     UnicodeString testString(FALSE, testFile, len);
1212
1213     enum EParseState{
1214         PARSE_COMMENT,
1215         PARSE_TAG,
1216         PARSE_DATA,
1217         PARSE_NUM
1218     }
1219     parseState = PARSE_TAG;
1220
1221     EParseState savedState = PARSE_TAG;
1222
1223     static const UChar CH_LF        = 0x0a;
1224     static const UChar CH_CR        = 0x0d;
1225     static const UChar CH_HASH      = 0x23;
1226     /*static const UChar CH_PERIOD    = 0x2e;*/
1227     static const UChar CH_LT        = 0x3c;
1228     static const UChar CH_GT        = 0x3e;
1229     static const UChar CH_BACKSLASH = 0x5c;
1230     static const UChar CH_BULLET    = 0x2022;
1231
1232     int32_t    lineNum  = 1;
1233     int32_t    colStart = 0;
1234     int32_t    column   = 0;
1235     int32_t    charIdx  = 0;
1236
1237     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1238
1239     for (charIdx = 0; charIdx < len; ) {
1240         status = U_ZERO_ERROR;
1241         UChar  c = testString.charAt(charIdx);
1242         charIdx++;
1243         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1244             // treat CRLF as a unit
1245             c = CH_LF;
1246             charIdx++;
1247         }
1248         if (c == CH_LF || c == CH_CR) {
1249             lineNum++;
1250             colStart = charIdx;
1251         }
1252         column = charIdx - colStart + 1;
1253
1254         switch (parseState) {
1255         case PARSE_COMMENT:
1256             if (c == 0x0a || c == 0x0d) {
1257                 parseState = savedState;
1258             }
1259             break;
1260
1261         case PARSE_TAG:
1262             {
1263             if (c == CH_HASH) {
1264                 parseState = PARSE_COMMENT;
1265                 savedState = PARSE_TAG;
1266                 break;
1267             }
1268             if (u_isUWhiteSpace(c)) {
1269                 break;
1270             }
1271             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1272                 delete tp.bi;
1273                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1274                 skipTest = false;
1275                 charIdx += 5;
1276                 break;
1277             }
1278             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1279                 delete tp.bi;
1280                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1281                 skipTest = false;
1282                 charIdx += 5;
1283                 break;
1284             }
1285             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1286                 delete tp.bi;
1287                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1288                 skipTest = false;
1289                 charIdx += 5;
1290                 break;
1291             }
1292             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1293                 delete tp.bi;
1294                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1295                 skipTest = false;
1296                 charIdx += 5;
1297                 break;
1298             }
1299             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1300                 delete tp.bi;
1301                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1302                 charIdx += 6;
1303                 break;
1304             }
1305
1306             // <locale  loc_name>
1307             localeMatcher.reset(testString);
1308             if (localeMatcher.lookingAt(charIdx-1, status)) {
1309                 UnicodeString localeName = localeMatcher.group(1, status);
1310                 char localeName8[100];
1311                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1312                 locale = Locale::createFromName(localeName8);
1313                 charIdx += localeMatcher.group(0, status).length() - 1;
1314                 TEST_ASSERT_SUCCESS(status);
1315                 break;
1316             }
1317             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1318                 parseState = PARSE_DATA;
1319                 charIdx += 5;
1320                 tp.dataToBreak = "";
1321                 tp.expectedBreaks->removeAllElements();
1322                 tp.srcCol ->removeAllElements();
1323                 tp.srcLine->removeAllElements();
1324                 break;
1325             }
1326
1327             errln("line %d: Tag expected in test file.", lineNum);
1328             parseState = PARSE_COMMENT;
1329             savedState = PARSE_DATA;
1330             goto end_test; // Stop the test.
1331             }
1332             break;
1333
1334         case PARSE_DATA:
1335             if (c == CH_BULLET) {
1336                 int32_t  breakIdx = tp.dataToBreak.length();
1337                 tp.expectedBreaks->setSize(breakIdx+1);
1338                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1339                 tp.srcLine->setSize(breakIdx+1);
1340                 tp.srcLine->setElementAt(lineNum, breakIdx);
1341                 tp.srcCol ->setSize(breakIdx+1);
1342                 tp.srcCol ->setElementAt(column, breakIdx);
1343                 break;
1344             }
1345
1346             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1347                 // Add final entry to mappings from break location to source file position.
1348                 //  Need one extra because last break position returned is after the
1349                 //    last char in the data, not at the last char.
1350                 tp.srcLine->addElement(lineNum, status);
1351                 tp.srcCol ->addElement(column, status);
1352
1353                 parseState = PARSE_TAG;
1354                 charIdx += 6;
1355
1356                 if (!skipTest) {
1357                     // RUN THE TEST!
1358                     status = U_ZERO_ERROR;
1359                     tp.setUTF16(status);
1360                     executeTest(&tp, status);
1361                     TEST_ASSERT_SUCCESS(status);
1362
1363                     // Run again, this time with UTF-8 text wrapped in a UText.
1364                     status = U_ZERO_ERROR;
1365                     tp.setUTF8(status);
1366                     TEST_ASSERT_SUCCESS(status);
1367                     executeTest(&tp, status);
1368                 }
1369                 break;
1370             }
1371
1372             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1373                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1374                 // Get the code point from the name and insert it into the test data.
1375                 //   (Damn, no API takes names in Unicode  !!!
1376                 //    we've got to take it back to char *)
1377                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1378                 int32_t nameLength = nameEndIdx - (charIdx+2);
1379                 char charNameBuf[200];
1380                 UChar32 theChar = -1;
1381                 if (nameEndIdx != -1) {
1382                     UErrorCode status = U_ZERO_ERROR;
1383                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1384                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1385                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1386                     if (U_FAILURE(status)) {
1387                         theChar = -1;
1388                     }
1389                 }
1390                 if (theChar == -1) {
1391                     errln("Error in named character in test file at line %d, col %d",
1392                         lineNum, column);
1393                 } else {
1394                     // Named code point was recognized.  Insert it
1395                     //   into the test data.
1396                     tp.dataToBreak.append(theChar);
1397                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1398                         tp.srcLine->addElement(lineNum, status);
1399                         tp.srcCol ->addElement(column, status);
1400                     }
1401                 }
1402                 if (nameEndIdx > charIdx) {
1403                     charIdx = nameEndIdx+1;
1404
1405                 }
1406                 break;
1407             }
1408
1409
1410
1411
1412             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1413                 charIdx++;
1414                 int32_t  breakIdx = tp.dataToBreak.length();
1415                 tp.expectedBreaks->setSize(breakIdx+1);
1416                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1417                 tp.srcLine->setSize(breakIdx+1);
1418                 tp.srcLine->setElementAt(lineNum, breakIdx);
1419                 tp.srcCol ->setSize(breakIdx+1);
1420                 tp.srcCol ->setElementAt(column, breakIdx);
1421                 break;
1422             }
1423
1424             if (c == CH_LT) {
1425                 tagValue   = 0;
1426                 parseState = PARSE_NUM;
1427                 break;
1428             }
1429
1430             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1431                 parseState = PARSE_COMMENT;
1432                 savedState = PARSE_DATA;
1433                 break;
1434             }
1435
1436             if (c == CH_BACKSLASH) {
1437                 // Check for \ at end of line, a line continuation.
1438                 //     Advance over (discard) the newline
1439                 UChar32 cp = testString.char32At(charIdx);
1440                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1441                     // We have a CR LF
1442                     //  Need an extra increment of the input ptr to move over both of them
1443                     charIdx++;
1444                 }
1445                 if (cp == CH_LF || cp == CH_CR) {
1446                     lineNum++;
1447                     colStart = charIdx;
1448                     charIdx++;
1449                     break;
1450                 }
1451
1452                 // Let unescape handle the back slash.
1453                 cp = testString.unescapeAt(charIdx);
1454                 if (cp != -1) {
1455                     // Escape sequence was recognized.  Insert the char
1456                     //   into the test data.
1457                     tp.dataToBreak.append(cp);
1458                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1459                         tp.srcLine->addElement(lineNum, status);
1460                         tp.srcCol ->addElement(column, status);
1461                     }
1462                     break;
1463                 }
1464
1465
1466                 // Not a recognized backslash escape sequence.
1467                 // Take the next char as a literal.
1468                 //  TODO:  Should this be an error?
1469                 c = testString.charAt(charIdx);
1470                 charIdx = testString.moveIndex32(charIdx, 1);
1471             }
1472
1473             // Normal, non-escaped data char.
1474             tp.dataToBreak.append(c);
1475
1476             // Save the mapping from offset in the data to line/column numbers in
1477             //   the original input file.  Will be used for better error messages only.
1478             //   If there's an expected break before this char, the slot in the mapping
1479             //     vector will already be set for this char; don't overwrite it.
1480             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1481                 tp.srcLine->addElement(lineNum, status);
1482                 tp.srcCol ->addElement(column, status);
1483             }
1484             break;
1485
1486
1487         case PARSE_NUM:
1488             // We are parsing an expected numeric tag value, like <1234>,
1489             //   within a chunk of data.
1490             if (u_isUWhiteSpace(c)) {
1491                 break;
1492             }
1493
1494             if (c == CH_GT) {
1495                 // Finished the number.  Add the info to the expected break data,
1496                 //   and switch parse state back to doing plain data.
1497                 parseState = PARSE_DATA;
1498                 if (tagValue == 0) {
1499                     tagValue = -1;
1500                 }
1501                 int32_t  breakIdx = tp.dataToBreak.length();
1502                 tp.expectedBreaks->setSize(breakIdx+1);
1503                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1504                 tp.srcLine->setSize(breakIdx+1);
1505                 tp.srcLine->setElementAt(lineNum, breakIdx);
1506                 tp.srcCol ->setSize(breakIdx+1);
1507                 tp.srcCol ->setElementAt(column, breakIdx);
1508                 break;
1509             }
1510
1511             if (u_isdigit(c)) {
1512                 tagValue = tagValue*10 + u_charDigitValue(c);
1513                 break;
1514             }
1515
1516             errln("Syntax Error in test file at line %d, col %d",
1517                 lineNum, column);
1518             parseState = PARSE_COMMENT;
1519             goto end_test; // Stop the test
1520             break;
1521         }
1522
1523
1524         if (U_FAILURE(status)) {
1525             dataerrln("ICU Error %s while parsing test file at line %d.",
1526                 u_errorName(status), lineNum);
1527             status = U_ZERO_ERROR;
1528             goto end_test; // Stop the test
1529         }
1530
1531     }
1532
1533 end_test:
1534     delete [] testFile;
1535 #endif
1536 }
1537
1538
1539 //-------------------------------------------------------------------------------
1540 //
1541 //  TestDictRules   create a break iterator from source rules that includes a
1542 //                  dictionary range.   Regression for bug #7130.  Source rules
1543 //                  do not declare a break iterator type (word, line, sentence, etc.
1544 //                  but the dictionary code, without a type, would loop.
1545 //
1546 //-------------------------------------------------------------------------------
1547 void RBBITest::TestDictRules() {
1548     const char *rules =  "$dictionary = [a-z]; \n"
1549                          "!!forward; \n"
1550                          "$dictionary $dictionary; \n"
1551                          "!!reverse; \n"
1552                          "$dictionary $dictionary; \n";
1553     const char *text = "aa";
1554     UErrorCode status = U_ZERO_ERROR;
1555     UParseError parseError;
1556
1557     RuleBasedBreakIterator bi(rules, parseError, status);
1558     if (U_SUCCESS(status)) {
1559         UnicodeString utext = text;
1560         bi.setText(utext);
1561         int32_t position;
1562         int32_t loops;
1563         for (loops = 0; loops<10; loops++) {
1564             position = bi.next();
1565             if (position == RuleBasedBreakIterator::DONE) {
1566                 break;
1567             }
1568         }
1569         TEST_ASSERT(loops == 1);
1570     } else {
1571         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1572     }
1573 }
1574
1575
1576
1577 //-------------------------------------------------------------------------------
1578 //
1579 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1580 //    return the data in one big UChar * buffer, which the caller must delete.
1581 //
1582 //    parameters:
1583 //          fileName:   the name of the file, with no directory part.  The test data directory
1584 //                      is assumed.
1585 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1586 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1587 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1588 //                      Pass NULL for the system default encoding.
1589 //          status
1590 //    returns:
1591 //                      The file data, converted to UChar.
1592 //                      The caller must delete this when done with
1593 //                           delete [] theBuffer;
1594 //
1595 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1596 //           Move this function to some common place.
1597 //
1598 //--------------------------------------------------------------------------------
1599 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1600     UChar       *retPtr  = NULL;
1601     char        *fileBuf = NULL;
1602     UConverter* conv     = NULL;
1603     FILE        *f       = NULL;
1604
1605     ulen = 0;
1606     if (U_FAILURE(status)) {
1607         return retPtr;
1608     }
1609
1610     //
1611     //  Open the file.
1612     //
1613     f = fopen(fileName, "rb");
1614     if (f == 0) {
1615         dataerrln("Error opening test data file %s\n", fileName);
1616         status = U_FILE_ACCESS_ERROR;
1617         return NULL;
1618     }
1619     //
1620     //  Read it in
1621     //
1622     int   fileSize;
1623     int   amt_read;
1624
1625     fseek( f, 0, SEEK_END);
1626     fileSize = ftell(f);
1627     fileBuf = new char[fileSize];
1628     fseek(f, 0, SEEK_SET);
1629     amt_read = fread(fileBuf, 1, fileSize, f);
1630     if (amt_read != fileSize || fileSize <= 0) {
1631         errln("Error reading test data file.");
1632         goto cleanUpAndReturn;
1633     }
1634
1635     //
1636     // Look for a Unicode Signature (BOM) on the data just read
1637     //
1638     int32_t        signatureLength;
1639     const char *   fileBufC;
1640     const char*    bomEncoding;
1641
1642     fileBufC = fileBuf;
1643     bomEncoding = ucnv_detectUnicodeSignature(
1644         fileBuf, fileSize, &signatureLength, &status);
1645     if(bomEncoding!=NULL ){
1646         fileBufC  += signatureLength;
1647         fileSize  -= signatureLength;
1648         encoding = bomEncoding;
1649     }
1650
1651     //
1652     // Open a converter to take the rule file to UTF-16
1653     //
1654     conv = ucnv_open(encoding, &status);
1655     if (U_FAILURE(status)) {
1656         goto cleanUpAndReturn;
1657     }
1658
1659     //
1660     // Convert the rules to UChar.
1661     //  Preflight first to determine required buffer size.
1662     //
1663     ulen = ucnv_toUChars(conv,
1664         NULL,           //  dest,
1665         0,              //  destCapacity,
1666         fileBufC,
1667         fileSize,
1668         &status);
1669     if (status == U_BUFFER_OVERFLOW_ERROR) {
1670         // Buffer Overflow is expected from the preflight operation.
1671         status = U_ZERO_ERROR;
1672
1673         retPtr = new UChar[ulen+1];
1674         ucnv_toUChars(conv,
1675             retPtr,       //  dest,
1676             ulen+1,
1677             fileBufC,
1678             fileSize,
1679             &status);
1680     }
1681
1682 cleanUpAndReturn:
1683     fclose(f);
1684     delete []fileBuf;
1685     ucnv_close(conv);
1686     if (U_FAILURE(status)) {
1687         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1688         delete []retPtr;
1689         retPtr = 0;
1690         ulen   = 0;
1691     };
1692     return retPtr;
1693 }
1694
1695
1696
1697 //--------------------------------------------------------------------------------------------
1698 //
1699 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1700 //
1701 //-------------------------------------------------------------------------------------------
1702 void RBBITest::TestUnicodeFiles() {
1703     RuleBasedBreakIterator  *bi;
1704     UErrorCode               status = U_ZERO_ERROR;
1705
1706     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1707     TEST_ASSERT_SUCCESS(status);
1708     if (U_SUCCESS(status)) {
1709         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1710     }
1711     delete bi;
1712
1713     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1714     TEST_ASSERT_SUCCESS(status);
1715     if (U_SUCCESS(status)) {
1716         runUnicodeTestData("WordBreakTest.txt", bi);
1717     }
1718     delete bi;
1719
1720     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1721     TEST_ASSERT_SUCCESS(status);
1722     if (U_SUCCESS(status)) {
1723         runUnicodeTestData("SentenceBreakTest.txt", bi);
1724     }
1725     delete bi;
1726
1727     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1728     TEST_ASSERT_SUCCESS(status);
1729     if (U_SUCCESS(status)) {
1730         runUnicodeTestData("LineBreakTest.txt", bi);
1731     }
1732     delete bi;
1733 }
1734
1735
1736 // Check for test cases from the Unicode test data files that are known to fail
1737 // and should be skipped because ICU is not yet able to fully implement the spec.
1738 // See ticket #7270.
1739
1740 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1741     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1742         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1743         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1744         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1745         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1746         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1747         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1748     };
1749     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1750         return FALSE;
1751     }
1752
1753     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1754         if (testCase == UnicodeString(badTestCases[i])) {
1755             return logKnownIssue("7270");
1756         }
1757     }
1758     return FALSE;
1759 }
1760
1761
1762 //--------------------------------------------------------------------------------------------
1763 //
1764 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1765 //
1766 //-------------------------------------------------------------------------------------------
1767 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1768 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1769     UErrorCode  status = U_ZERO_ERROR;
1770
1771     //
1772     //  Open and read the test data file, put it into a UnicodeString.
1773     //
1774     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1775     char testFileName[1000];
1776     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1777         dataerrln("Can't open test data.  Path too long.");
1778         return;
1779     }
1780     strcpy(testFileName, testDataDirectory);
1781     strcat(testFileName, fileName);
1782
1783     logln("Opening data file %s\n", fileName);
1784
1785     int    len;
1786     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1787     if (status != U_FILE_ACCESS_ERROR) {
1788         TEST_ASSERT_SUCCESS(status);
1789         TEST_ASSERT(testFile != NULL);
1790     }
1791     if (U_FAILURE(status) || testFile == NULL) {
1792         return; /* something went wrong, error already output */
1793     }
1794     UnicodeString testFileAsString(TRUE, testFile, len);
1795
1796     //
1797     //  Parse the test data file using a regular expression.
1798     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1799     //     is identified by which group had a match.
1800     //
1801     //    Caputure Group #                  1          2            3            4           5
1802     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1803     //
1804     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1805     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1806     UnicodeString   testString;
1807     UVector32       breakPositions(status);
1808     int             lineNumber = 1;
1809     TEST_ASSERT_SUCCESS(status);
1810     if (U_FAILURE(status)) {
1811         return;
1812     }
1813
1814     //
1815     //  Scan through each test case, building up the string to be broken in testString,
1816     //   and the positions that should be boundaries in the breakPositions vector.
1817     //
1818     int spin = 0;
1819     while (tokenMatcher.find()) {
1820         if(tokenMatcher.hitEnd()) {
1821           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1822              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1823              and caused an infinite loop here on EBCDIC systems!
1824           */
1825           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1826           //       return;
1827         }
1828         if (tokenMatcher.start(1, status) >= 0) {
1829             // Scanned a divide sign, indicating a break position in the test data.
1830             if (testString.length()>0) {
1831                 breakPositions.addElement(testString.length(), status);
1832             }
1833         }
1834         else if (tokenMatcher.start(2, status) >= 0) {
1835             // Scanned an 'x', meaning no break at this position in the test data
1836             //   Nothing to be done here.
1837             }
1838         else if (tokenMatcher.start(3, status) >= 0) {
1839             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1840             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1841             int length = hexNumber.length();
1842             if (length<=8) {
1843                 char buf[10];
1844                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1845                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1846                 if (c<=0x10ffff) {
1847                     testString.append(c);
1848                 } else {
1849                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1850                        fileName, lineNumber);
1851                 }
1852             } else {
1853                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1854                        fileName, lineNumber);
1855              }
1856         }
1857         else if (tokenMatcher.start(4, status) >= 0) {
1858             // Scanned to end of a line, possibly skipping over a comment in the process.
1859             //   If the line from the file contained test data, run the test now.
1860             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1861                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1862             }
1863
1864             // Clear out this test case.
1865             //    The string and breakPositions vector will be refilled as the next
1866             //       test case is parsed.
1867             testString.remove();
1868             breakPositions.removeAllElements();
1869             lineNumber++;
1870         } else {
1871             // Scanner catchall.  Something unrecognized appeared on the line.
1872             char token[16];
1873             UnicodeString uToken = tokenMatcher.group(0, status);
1874             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1875             token[sizeof(token)-1] = 0;
1876             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1877
1878             // Clean up, in preparation for continuing with the next line.
1879             testString.remove();
1880             breakPositions.removeAllElements();
1881             lineNumber++;
1882         }
1883         TEST_ASSERT_SUCCESS(status);
1884         if (U_FAILURE(status)) {
1885             break;
1886         }
1887     }
1888
1889     delete [] testFile;
1890  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1891 }
1892
1893 //--------------------------------------------------------------------------------------------
1894 //
1895 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1896 //                            test data files.  Do only a simple, forward-only check -
1897 //                            this test is mostly to check that ICU and the Unicode
1898 //                            data agree with each other.
1899 //
1900 //--------------------------------------------------------------------------------------------
1901 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1902                          const UnicodeString &testString,   // Text data to be broken
1903                          UVector32 *breakPositions,         // Positions where breaks should be found.
1904                          RuleBasedBreakIterator *bi) {
1905     int32_t pos;                 // Break Position in the test string
1906     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1907     int32_t expectedPos;         // Expected break position (index into test string)
1908
1909     bi->setText(testString);
1910     pos = bi->first();
1911     pos = bi->next();
1912
1913     while (pos != BreakIterator::DONE) {
1914         if (expectedI >= breakPositions->size()) {
1915             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1916                 testFileName, lineNumber, pos);
1917             break;
1918         }
1919         expectedPos = breakPositions->elementAti(expectedI);
1920         if (pos < expectedPos) {
1921             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1922                 testFileName, lineNumber, pos);
1923             break;
1924         }
1925         if (pos > expectedPos) {
1926             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1927                 testFileName, lineNumber, expectedPos);
1928             break;
1929         }
1930         pos = bi->next();
1931         expectedI++;
1932     }
1933
1934     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1935         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1936             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1937     }
1938 }
1939
1940
1941
1942 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1943 //---------------------------------------------------------------------------------------
1944 //
//   class RBBIMonkeyKind
1946 //
1947 //      Monkey Test for Break Iteration
1948 //      Abstract interface class.   Concrete derived classes independently
1949 //      implement the break rules for different iterator types.
1950 //
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1953 //
1954 //---------------------------------------------------------------------------------------
1955 class RBBIMonkeyKind {
1956 public:
1957     // Return a UVector of UnicodeSets, representing the character classes used
1958     //   for this type of iterator.
1959     virtual  UVector  *charClasses() = 0;
1960
1961     // Set the test text on which subsequent calls to next() will operate
1962     virtual  void      setText(const UnicodeString &s) = 0;
1963
1964     // Find the next break postion, starting from the prev break position, or from zero.
1965     // Return -1 after reaching end of string.
1966     virtual  int32_t   next(int32_t i) = 0;
1967
1968     virtual ~RBBIMonkeyKind();
1969     UErrorCode       deferredStatus;
1970
1971
1972 protected:
1973     RBBIMonkeyKind();
1974
1975 private:
1976 };
1977
1978 RBBIMonkeyKind::RBBIMonkeyKind() {
1979     deferredStatus = U_ZERO_ERROR;
1980 }
1981
1982 RBBIMonkeyKind::~RBBIMonkeyKind() {
1983 }
1984
1985
1986 //----------------------------------------------------------------------------------------
1987 //
1988 //   Random Numbers.  Similar to standard lib rand() and srand()
1989 //                    Not using library to
1990 //                      1.  Get same results on all platforms.
1991 //                      2.  Get access to current seed, to more easily reproduce failures.
1992 //
1993 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to restart the sequence from a known seed.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
// All arithmetic is unsigned 32 bit, so the multiplication wraps modulo 2^32.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = m_seed * multiplier + increment;

    // Bits 16..30 of the new state: same as (m_seed/65536) % 32768.
    return (m_seed >> 16) & 0x7FFF;
}
2001
2002
2003 //------------------------------------------------------------------------------------------
2004 //
2005 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2006 //                             of RBBIMonkeyKind.
2007 //
2008 //------------------------------------------------------------------------------------------
2009 class RBBICharMonkey: public RBBIMonkeyKind {
2010 public:
2011     RBBICharMonkey();
2012     virtual          ~RBBICharMonkey();
2013     virtual  UVector *charClasses();
2014     virtual  void     setText(const UnicodeString &s);
2015     virtual  int32_t  next(int32_t i);
2016 private:
2017     UVector   *fSets;
2018
2019     UnicodeSet  *fCRLFSet;
2020     UnicodeSet  *fControlSet;
2021     UnicodeSet  *fExtendSet;
2022     UnicodeSet  *fRegionalIndicatorSet;
2023     UnicodeSet  *fPrependSet;
2024     UnicodeSet  *fSpacingSet;
2025     UnicodeSet  *fLSet;
2026     UnicodeSet  *fVSet;
2027     UnicodeSet  *fTSet;
2028     UnicodeSet  *fLVSet;
2029     UnicodeSet  *fLVTSet;
2030     UnicodeSet  *fHangulSet;
2031     UnicodeSet  *fAnySet;
2032     UnicodeSet  *fEmojiModifierSet;
2033     UnicodeSet  *fEmojiBaseSet;
2034     UnicodeSet  *fZWJSet;
2035     UnicodeSet  *fGAZSet;
2036
2037     const UnicodeString *fText;
2038 };
2039
2040
2041 RBBICharMonkey::RBBICharMonkey() {
2042     UErrorCode  status = U_ZERO_ERROR;
2043
2044     fText = NULL;
2045
2046     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2047     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
2048     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
2049     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2050     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2051     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2052     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2053     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2054     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2055     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2056     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2057     fHangulSet  = new UnicodeSet();
2058     fHangulSet->addAll(*fLSet);
2059     fHangulSet->addAll(*fVSet);
2060     fHangulSet->addAll(*fTSet);
2061     fHangulSet->addAll(*fLVSet);
2062     fHangulSet->addAll(*fLVTSet);
2063     fAnySet     = new UnicodeSet(0, 0x10ffff);
2064
2065
2066
2067     fEmojiBaseSet = new UnicodeSet(UnicodeString(
2068                 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
2069                 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
2070                 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
2071                 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
2072
2073     fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
2074     fZWJSet           = new UnicodeSet(0x200D, 0x200D);
2075     fGAZSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2764\\U0001F308\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8]"), status);
2076
2077     fSets       = new UVector(status);
2078     fSets->addElement(fCRLFSet,    status);
2079     fSets->addElement(fControlSet, status);
2080     fSets->addElement(fExtendSet,  status);
2081     fSets->addElement(fRegionalIndicatorSet, status);
2082     if (!fPrependSet->isEmpty()) {
2083         fSets->addElement(fPrependSet, status);
2084     }
2085     fSets->addElement(fSpacingSet, status);
2086     fSets->addElement(fHangulSet,  status);
2087     fSets->addElement(fAnySet,     status);
2088     fSets->addElement(fEmojiBaseSet, status);
2089     fSets->addElement(fEmojiModifierSet, status);
2090     fSets->addElement(fZWJSet,     status);
2091     fSets->addElement(fGAZSet,     status);
2092     if (U_FAILURE(status)) {
2093         deferredStatus = status;
2094     }
2095 }
2096
2097
2098 void RBBICharMonkey::setText(const UnicodeString &s) {
2099     fText = &s;
2100 }
2101
2102
2103
// Find the first break position that follows prevPos in the text given to setText().
//
//   prevPos   index of the previous break position, or zero to start from the
//             beginning of the text.
//   returns   the index of the next break position, or -1 when construction of
//             this object failed (deferredStatus) or prevPos is at or beyond
//             the end of the text.
//
// The rules are applied in the order written.  For each candidate position the
//   first rule that matches decides the outcome: "continue" means no break
//   before p2, "break" means a break before p2.
// NOTE(review): the rule labels (GB3 ... GB999) presumably refer to the grapheme
//   cluster boundary rules of UAX #29 - confirm against the spec version in use.
int32_t RBBICharMonkey::next(int32_t prevPos) {
    int    p0, p1, p2, p3;    // Indices of the significant code points around the
                              //   break position being tested.  The candidate break
                              //   location is before p2.

    int     breakPos = -1;

    UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
    UChar32 cBase;            // for (X Extend*) patterns, the X character.

    if (U_FAILURE(deferredStatus)) {
        return -1;
    }

    // Previous break at end of string.  return DONE.
    if (prevPos >= fText->length()) {
        return -1;
    }
    p0 = p1 = p2 = p3 = prevPos;
    c3 =  fText->char32At(prevPos);
    c0 = c1 = c2 = cBase = 0;
    (void)p0;   // suppress set but not used warning.
    (void)c0;

    // Loop runs once per "significant" character position in the input text.
    for (;;) {
        // Move all of the positions forward in the input string.
        p0 = p1;  c0 = c1;
        p1 = p2;  c1 = c2;
        p2 = p3;  c2 = c3;

        // Advance p3 by one codepoint
        p3 = fText->moveIndex32(p3, 1);
        c3 = fText->char32At(p3);

        if (p1 == p2) {
            // Still warming up the loop.  (won't work with zero length strings, but we don't care)
            // On the first pass p1 and p2 both still hold prevPos.
            continue;
        }
        if (p2 == fText->length()) {
            // Reached end of string.  Always a break position.
            break;
        }

        // Rule  GB3   CR x LF
        //     No Extend or Format characters may appear between the CR and LF,
        //     which requires the additional check for p2 immediately following p1.
        //
        if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
            continue;
        }

        // Rule (GB4).   ( Control | CR | LF ) <break>
        if (fControlSet->contains(c1) ||
            c1 == 0x0D ||
            c1 == 0x0A)  {
            break;
        }

        // Rule (GB5)    <break>  ( Control | CR | LF )
        //
        if (fControlSet->contains(c2) ||
            c2 == 0x0D ||
            c2 == 0x0A)  {
            break;
        }


        // Rule (GB6)  L x ( L | V | LV | LVT )
        if (fLSet->contains(c1) &&
               (fLSet->contains(c2)  ||
                fVSet->contains(c2)  ||
                fLVSet->contains(c2) ||
                fLVTSet->contains(c2))) {
            continue;
        }

        // Rule (GB7)    ( LV | V )  x  ( V | T )
        if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
            (fVSet->contains(c2) || fTSet->contains(c2)))  {
            continue;
        }

        // Rule (GB8)    ( LVT | T)  x T
        if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
            fTSet->contains(c2))  {
            continue;
        }

        // Rule (GB9)    x (Extend | ZWJ)
        //     When c1 is not itself an Extend character it is remembered in cBase,
        //     for use by the second form of rule GB10 below.
        if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
            if (!fExtendSet->contains(c1)) {
                cBase = c1;
            }
            continue;
        }

        // Rule (GB9a)   x  SpacingMark
        if (fSpacingSet->contains(c2)) {
            continue;
        }

        // Rule (GB9b)   Prepend x
        if (fPrependSet->contains(c1)) {
            continue;
        }

        // Rule (GB10)   ($E_Base | $GAZ) $Extend* $E_Modifier;
        //     First form: the base character is immediately followed by the modifier.
        if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
            continue;
        }
        //     Second form: Extend characters separate the base (saved in cBase by
        //     rule GB9) from the modifier.
        if ((fEmojiBaseSet->contains(cBase) || fGAZSet->contains(cBase)) &&
                fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
            continue;
        }

        // Rule (GB11)   ZWJ x Glue_After_Zwj
        if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
            continue;
        }

        // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
        //                   Note: The first if condition is a little tricky. We only need to force
        //                      a break if there are three or more contiguous RIs. If there are
        //                      only two, a break following will occur via other rules, and will include
        //                      any trailing extend characters, which is needed behavior.
        if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
                && fRegionalIndicatorSet->contains(c2)) {
            break;
        }
        if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
            continue;
        }

        // Rule (GB999)  Any  <break>  Any
        break;
    }

    // Every "break" above leaves the loop with the boundary located before p2.
    breakPos = p2;
    return breakPos;
}
2245
2246
2247
2248 UVector  *RBBICharMonkey::charClasses() {
2249     return fSets;
2250 }
2251
2252
2253 RBBICharMonkey::~RBBICharMonkey() {
2254     delete fSets;
2255     delete fCRLFSet;
2256     delete fControlSet;
2257     delete fExtendSet;
2258     delete fRegionalIndicatorSet;
2259     delete fPrependSet;
2260     delete fSpacingSet;
2261     delete fLSet;
2262     delete fVSet;
2263     delete fTSet;
2264     delete fLVSet;
2265     delete fLVTSet;
2266     delete fHangulSet;
2267     delete fAnySet;
2268     delete fEmojiBaseSet;
2269     delete fEmojiModifierSet;
2270     delete fZWJSet;
2271     delete fGAZSet;
2272 }
2273
2274 //------------------------------------------------------------------------------------------
2275 //
2276 //   class RBBIWordMonkey      Word Break specific implementation
2277 //                             of RBBIMonkeyKind.
2278 //
2279 //------------------------------------------------------------------------------------------
2280 class RBBIWordMonkey: public RBBIMonkeyKind {
2281 public:
2282     RBBIWordMonkey();
2283     virtual          ~RBBIWordMonkey();
2284     virtual  UVector *charClasses();
2285     virtual  void     setText(const UnicodeString &s);
2286     virtual int32_t   next(int32_t i);
2287 private:
2288     UVector      *fSets;
2289
2290     UnicodeSet  *fCRSet;
2291     UnicodeSet  *fLFSet;
2292     UnicodeSet  *fNewlineSet;
2293     UnicodeSet  *fRegionalIndicatorSet;
2294     UnicodeSet  *fKatakanaSet;
2295     UnicodeSet  *fHebrew_LetterSet;
2296     UnicodeSet  *fALetterSet;
2297     // TODO(jungshik): Do we still need this change?
2298     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2299     UnicodeSet  *fSingle_QuoteSet;
2300     UnicodeSet  *fDouble_QuoteSet;
2301     UnicodeSet  *fMidNumLetSet;
2302     UnicodeSet  *fMidLetterSet;
2303     UnicodeSet  *fMidNumSet;
2304     UnicodeSet  *fNumericSet;
2305     UnicodeSet  *fFormatSet;
2306     UnicodeSet  *fOtherSet;
2307     UnicodeSet  *fExtendSet;
2308     UnicodeSet  *fExtendNumLetSet;
2309     UnicodeSet  *fDictionaryCjkSet;
2310     UnicodeSet  *fEBaseSet;
2311     UnicodeSet  *fEModifierSet;
2312     UnicodeSet  *fZWSSet;
2313     UnicodeSet  *fGAZSet;
2314
2315     const UnicodeString  *fText;
2316 };
2317
2318
2319 RBBIWordMonkey::RBBIWordMonkey()
2320 {
2321     UErrorCode  status = U_ZERO_ERROR;
2322
2323     fSets            = new UVector(status);
2324
2325     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2326     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2327     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2328     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2329     // Exclude Hangul syllables from ALetterSet during testing.
2330     // Leave CJK dictionary characters out from the monkey tests!
2331 #if 0
2332     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2333                                       "[\\p{Line_Break = Complex_Context}"
2334                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2335                                       "-\\p{Grapheme_Cluster_Break = Control}"
2336                                       "]]",
2337                                       status);
2338 #endif
2339     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2340     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2341     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2342     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2343     fALetterSet->removeAll(*fDictionaryCjkSet);
2344     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2345     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2346     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2347     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"),    status);
2348     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2349     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2350     // we should figure out why
2351     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2352     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2353     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2354     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2355
2356     fEBaseSet         = new UnicodeSet(UnicodeString(
2357                 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
2358                 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
2359                 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
2360                 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
2361
2362     fEModifierSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
2363     fZWSSet          = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
2364     fGAZSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2764\\U0001F308\\U0001F466-\\U0001F469\\U0001F48B\\U0001F5E8]"), status);
2365     fExtendSet->removeAll(*fZWSSet);
2366
2367
2368     fOtherSet        = new UnicodeSet();
2369     if(U_FAILURE(status)) {
2370       deferredStatus = status;
2371       return;
2372     }
2373
2374     fOtherSet->complement();
2375     fOtherSet->removeAll(*fCRSet);
2376     fOtherSet->removeAll(*fLFSet);
2377     fOtherSet->removeAll(*fNewlineSet);
2378     fOtherSet->removeAll(*fKatakanaSet);
2379     fOtherSet->removeAll(*fHebrew_LetterSet);
2380     fOtherSet->removeAll(*fALetterSet);
2381     fOtherSet->removeAll(*fSingle_QuoteSet);
2382     fOtherSet->removeAll(*fDouble_QuoteSet);
2383     fOtherSet->removeAll(*fMidLetterSet);
2384     fOtherSet->removeAll(*fMidNumSet);
2385     fOtherSet->removeAll(*fNumericSet);
2386     fOtherSet->removeAll(*fExtendNumLetSet);
2387     fOtherSet->removeAll(*fFormatSet);
2388     fOtherSet->removeAll(*fExtendSet);
2389     fOtherSet->removeAll(*fRegionalIndicatorSet);
2390     fOtherSet->removeAll(*fEBaseSet);
2391     fOtherSet->removeAll(*fEModifierSet);
2392     fOtherSet->removeAll(*fZWSSet);
2393     fOtherSet->removeAll(*fGAZSet);
2394
2395     // Inhibit dictionary characters from being tested at all.
2396     fOtherSet->removeAll(*fDictionaryCjkSet);
2397     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2398
2399     fSets->addElement(fCRSet,                status);
2400     fSets->addElement(fLFSet,                status);
2401     fSets->addElement(fNewlineSet,           status);
2402     fSets->addElement(fRegionalIndicatorSet, status);
2403     fSets->addElement(fHebrew_LetterSet,     status);
2404     fSets->addElement(fALetterSet,           status);
2405     fSets->addElement(fSingle_QuoteSet,      status);
2406     fSets->addElement(fDouble_QuoteSet,      status);
2407     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2408     fSets->addElement(fMidLetterSet,         status);
2409     fSets->addElement(fMidNumLetSet,         status);
2410     fSets->addElement(fMidNumSet,            status);
2411     fSets->addElement(fNumericSet,           status);
2412     fSets->addElement(fFormatSet,            status);
2413     fSets->addElement(fExtendSet,            status);
2414     fSets->addElement(fOtherSet,             status);
2415     fSets->addElement(fExtendNumLetSet,      status);
2416
2417     fSets->addElement(fEBaseSet,             status);
2418     fSets->addElement(fEModifierSet,         status);
2419     fSets->addElement(fZWSSet,               status);
2420     fSets->addElement(fGAZSet,               status);
2421
2422     if (U_FAILURE(status)) {
2423         deferredStatus = status;
2424     }
2425 }
2426
2427 void RBBIWordMonkey::setText(const UnicodeString &s) {
2428     fText       = &s;
2429 }
2430
2431
2432 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2433     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2434                               //   break position being tested.  The candidate break
2435                               //   location is before p2.
2436
2437     int     breakPos = -1;
2438
2439     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2440
2441     if (U_FAILURE(deferredStatus)) {
2442         return -1;
2443     }
2444
2445     // Prev break at end of string.  return DONE.
2446     if (prevPos >= fText->length()) {
2447         return -1;
2448     }
2449     p0 = p1 = p2 = p3 = prevPos;
2450     c3 =  fText->char32At(prevPos);
2451     c0 = c1 = c2 = 0;
2452     (void)p0;       // Suppress set but not used warning.
2453
2454     // Loop runs once per "significant" character position in the input text.
2455     for (;;) {
2456         // Move all of the positions forward in the input string.
2457         p0 = p1;  c0 = c1;
2458         p1 = p2;  c1 = c2;
2459         p2 = p3;  c2 = c3;
2460
2461         // Advancd p3 by    X(Extend | Format)*   Rule 4
2462         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2463         do {
2464             p3 = fText->moveIndex32(p3, 1);
2465             c3 = fText->char32At(p3);
2466             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2467                break;
2468             };
2469         }
2470         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
2471
2472
2473         if (p1 == p2) {
2474             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2475             continue;
2476         }
2477         if (p2 == fText->length()) {
2478             // Reached end of string.  Always a break position.
2479             break;
2480         }
2481
2482         // Rule  (3)   CR x LF
2483         //     No Extend or Format characters may appear between the CR and LF,
2484         //     which requires the additional check for p2 immediately following p1.
2485         //
2486         if (c1==0x0D && c2==0x0A) {
2487             continue;
2488         }
2489
2490         // Rule (3a)  Break before and after newlines (including CR and LF)
2491         //
2492         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2493             break;
2494         };
2495         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2496             break;
2497         };
2498
2499         // Rule (3c)    ZWJ x GAZ (Glue after ZWJ).
2500         //              Not ignoring extend chars, so peek into input text to
2501         //              get the potential ZWJ, the character immediately preceding c2.
2502         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2503         //              but char32At will get the full code point.
2504         if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
2505             continue;
2506         }
2507
2508         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2509         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2510             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2511             continue;
2512         }
2513
2514         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2515         //
2516         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2517              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2518              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2519             continue;
2520         }
2521
2522         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2523         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2524             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2525             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2526             continue;
2527         }
2528
2529         // Rule (7a)     Hebrew_Letter x Single_Quote
2530         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2531             continue;
2532         }
2533
2534         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2535         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2536             continue;
2537         }
2538
2539         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2540         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2541             continue;
2542         }
2543
2544         // Rule (8)    Numeric x Numeric
2545         if (fNumericSet->contains(c1) &&
2546             fNumericSet->contains(c2))  {
2547             continue;
2548         }
2549
2550         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2551         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2552             fNumericSet->contains(c2))  {
2553             continue;
2554         }
2555
2556         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2557         if (fNumericSet->contains(c1) &&
2558             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2559             continue;
2560         }
2561
2562         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2563         if (fNumericSet->contains(c0) &&
2564             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2565             fNumericSet->contains(c2)) {
2566             continue;
2567         }
2568
2569         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2570         if (fNumericSet->contains(c1) &&
2571             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2572             fNumericSet->contains(c3)) {
2573             continue;
2574         }
2575
2576         // Rule (13)  Katakana x Katakana
2577         if (fKatakanaSet->contains(c1) &&
2578             fKatakanaSet->contains(c2))  {
2579             continue;
2580         }
2581
2582         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2583         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2584              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2585              fExtendNumLetSet->contains(c2)) {
2586                 continue;
2587         }
2588
2589         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2590         if (fExtendNumLetSet->contains(c1) &&
2591                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2592                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2593             continue;
2594         }
2595
2596         // Rule 13c
2597         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2598             break;
2599         }
2600         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2601             continue;
2602         }
2603
2604         // Rule 13d
2605         if ((fEBaseSet->contains(c1)  || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
2606             continue;
2607         }
2608
2609         // Rule 14.  Break found here.
2610         break;
2611     }
2612
2613     breakPos = p2;
2614     return breakPos;
2615 }
2616
2617
2618 UVector  *RBBIWordMonkey::charClasses() {
2619     return fSets;
2620 }
2621
2622
2623 RBBIWordMonkey::~RBBIWordMonkey() {
2624     delete fSets;
2625     delete fCRSet;
2626     delete fLFSet;
2627     delete fNewlineSet;
2628     delete fKatakanaSet;
2629     delete fHebrew_LetterSet;
2630     delete fALetterSet;
2631     delete fSingle_QuoteSet;
2632     delete fDouble_QuoteSet;
2633     delete fMidNumLetSet;
2634     delete fMidLetterSet;
2635     delete fMidNumSet;
2636     delete fNumericSet;
2637     delete fFormatSet;
2638     delete fExtendSet;
2639     delete fExtendNumLetSet;
2640     delete fRegionalIndicatorSet;
2641     delete fDictionaryCjkSet;
2642     delete fOtherSet;
2643     delete fEBaseSet;
2644     delete fEModifierSet;
2645     delete fZWSSet;
2646     delete fGAZSet;
2647 }
2648
2649
2650
2651
2652 //------------------------------------------------------------------------------------------
2653 //
2654 //   class RBBISentMonkey      Sentence Break specific implementation
2655 //                             of RBBIMonkeyKind.
2656 //
2657 //------------------------------------------------------------------------------------------
2658 class RBBISentMonkey: public RBBIMonkeyKind {
2659 public:
2660     RBBISentMonkey();
2661     virtual          ~RBBISentMonkey();
2662     virtual  UVector *charClasses();
2663     virtual  void     setText(const UnicodeString &s);
2664     virtual int32_t   next(int32_t i);
2665 private:
2666     int               moveBack(int posFrom);
2667     int               moveForward(int posFrom);
2668     UChar32           cAt(int pos);
2669
2670     UVector      *fSets;
2671
2672     UnicodeSet  *fSepSet;
2673     UnicodeSet  *fFormatSet;
2674     UnicodeSet  *fSpSet;
2675     UnicodeSet  *fLowerSet;
2676     UnicodeSet  *fUpperSet;
2677     UnicodeSet  *fOLetterSet;
2678     UnicodeSet  *fNumericSet;
2679     UnicodeSet  *fATermSet;
2680     UnicodeSet  *fSContinueSet;
2681     UnicodeSet  *fSTermSet;
2682     UnicodeSet  *fCloseSet;
2683     UnicodeSet  *fOtherSet;
2684     UnicodeSet  *fExtendSet;
2685
2686     const UnicodeString  *fText;
2687
2688 };
2689
2690 RBBISentMonkey::RBBISentMonkey()
2691 {
2692     UErrorCode  status = U_ZERO_ERROR;
2693
2694     fSets            = new UVector(status);
2695
2696     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2697     //                       set and made into character classes of their own.  For the monkey impl,
2698     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2699     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2700     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2701     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2702     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2703     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2704     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2705     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2706     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2707     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2708     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2709     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2710     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2711     fOtherSet        = new UnicodeSet();
2712
2713     if(U_FAILURE(status)) {
2714       deferredStatus = status;
2715       return;
2716     }
2717
2718     fOtherSet->complement();
2719     fOtherSet->removeAll(*fSepSet);
2720     fOtherSet->removeAll(*fFormatSet);
2721     fOtherSet->removeAll(*fSpSet);
2722     fOtherSet->removeAll(*fLowerSet);
2723     fOtherSet->removeAll(*fUpperSet);
2724     fOtherSet->removeAll(*fOLetterSet);
2725     fOtherSet->removeAll(*fNumericSet);
2726     fOtherSet->removeAll(*fATermSet);
2727     fOtherSet->removeAll(*fSContinueSet);
2728     fOtherSet->removeAll(*fSTermSet);
2729     fOtherSet->removeAll(*fCloseSet);
2730     fOtherSet->removeAll(*fExtendSet);
2731
2732     fSets->addElement(fSepSet,       status);
2733     fSets->addElement(fFormatSet,    status);
2734     fSets->addElement(fSpSet,        status);
2735     fSets->addElement(fLowerSet,     status);
2736     fSets->addElement(fUpperSet,     status);
2737     fSets->addElement(fOLetterSet,   status);
2738     fSets->addElement(fNumericSet,   status);
2739     fSets->addElement(fATermSet,     status);
2740     fSets->addElement(fSContinueSet, status);
2741     fSets->addElement(fSTermSet,     status);
2742     fSets->addElement(fCloseSet,     status);
2743     fSets->addElement(fOtherSet,     status);
2744     fSets->addElement(fExtendSet,    status);
2745
2746     if (U_FAILURE(status)) {
2747         deferredStatus = status;
2748     }
2749 }
2750
2751
2752
2753 void RBBISentMonkey::setText(const UnicodeString &s) {
2754     fText       = &s;
2755 }
2756
2757 UVector  *RBBISentMonkey::charClasses() {
2758     return fSets;
2759 }
2760
2761
2762 //  moveBack()   Find the "significant" code point preceding the index i.
2763 //               Skips over ($Extend | $Format)* .
2764 //
2765 int RBBISentMonkey::moveBack(int i) {
2766     if (i <= 0) {
2767         return -1;
2768     }
2769     UChar32   c;
2770     int32_t   j = i;
2771     do {
2772         j = fText->moveIndex32(j, -1);
2773         c = fText->char32At(j);
2774     }
2775     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2776     return j;
2777
2778  }
2779
2780
2781 int RBBISentMonkey::moveForward(int i) {
2782     if (i>=fText->length()) {
2783         return fText->length();
2784     }
2785     UChar32   c;
2786     int32_t   j = i;
2787     do {
2788         j = fText->moveIndex32(j, 1);
2789         c = cAt(j);
2790     }
2791     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2792     return j;
2793 }
2794
2795 UChar32 RBBISentMonkey::cAt(int pos) {
2796     if (pos<0 || pos>=fText->length()) {
2797         return -1;
2798     } else {
2799         return fText->char32At(pos);
2800     }
2801 }
2802
2803 int32_t RBBISentMonkey::next(int32_t prevPos) {
2804     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2805                               //   break position being tested.  The candidate break
2806                               //   location is before p2.
2807
2808     int     breakPos = -1;
2809
2810     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2811     UChar32 c;
2812
2813     if (U_FAILURE(deferredStatus)) {
2814         return -1;
2815     }
2816
2817     // Prev break at end of string.  return DONE.
2818     if (prevPos >= fText->length()) {
2819         return -1;
2820     }
2821     p0 = p1 = p2 = p3 = prevPos;
2822     c3 =  fText->char32At(prevPos);
2823     c0 = c1 = c2 = 0;
2824     (void)p0;     // Suppress set but not used warning.
2825
2826     // Loop runs once per "significant" character position in the input text.
2827     for (;;) {
2828         // Move all of the positions forward in the input string.
2829         p0 = p1;  c0 = c1;
2830         p1 = p2;  c1 = c2;
2831         p2 = p3;  c2 = c3;
2832
2833         // Advancd p3 by    X(Extend | Format)*   Rule 4
2834         p3 = moveForward(p3);
2835         c3 = cAt(p3);
2836
2837         // Rule (3)  CR x LF
2838         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2839             continue;
2840         }
2841
2842         // Rule (4).   Sep  <break>
2843         if (fSepSet->contains(c1)) {
2844             p2 = p1+1;   // Separators don't combine with Extend or Format.
2845             break;
2846         }
2847
2848         if (p2 >= fText->length()) {
2849             // Reached end of string.  Always a break position.
2850             break;
2851         }
2852
2853         if (p2 == prevPos) {
2854             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2855             continue;
2856         }
2857
2858         // Rule (6).   ATerm x Numeric
2859         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2860             continue;
2861         }
2862
2863         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2864         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2865                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2866             continue;
2867         }
2868
2869         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2870         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2871         //                  note to the Unicode 5.0 documents.
2872         int p8 = p1;
2873         while (fSpSet->contains(cAt(p8))) {
2874             p8 = moveBack(p8);
2875         }
2876         while (fCloseSet->contains(cAt(p8))) {
2877             p8 = moveBack(p8);
2878         }
2879         if (fATermSet->contains(cAt(p8))) {
2880             p8=p2;
2881             for (;;) {
2882                 c = cAt(p8);
2883                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2884                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2885                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2886                     break;
2887                 }
2888                 p8 = moveForward(p8);
2889             }
2890             if (fLowerSet->contains(cAt(p8))) {
2891                 continue;
2892             }
2893         }
2894
2895         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2896         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2897             p8 = p1;
2898             while (fSpSet->contains(cAt(p8))) {
2899                 p8 = moveBack(p8);
2900             }
2901             while (fCloseSet->contains(cAt(p8))) {
2902                 p8 = moveBack(p8);
2903             }
2904             c = cAt(p8);
2905             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2906                 continue;
2907             }
2908         }
2909
2910         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2911         int p9 = p1;
2912         while (fCloseSet->contains(cAt(p9))) {
2913             p9 = moveBack(p9);
2914         }
2915         c = cAt(p9);
2916         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2917             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2918                 continue;
2919             }
2920         }
2921
2922         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2923         int p10 = p1;
2924         while (fSpSet->contains(cAt(p10))) {
2925             p10 = moveBack(p10);
2926         }
2927         while (fCloseSet->contains(cAt(p10))) {
2928             p10 = moveBack(p10);
2929         }
2930         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2931             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2932                 continue;
2933             }
2934         }
2935
2936         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2937         int p11 = p1;
2938         if (fSepSet->contains(cAt(p11))) {
2939             p11 = moveBack(p11);
2940         }
2941         while (fSpSet->contains(cAt(p11))) {
2942             p11 = moveBack(p11);
2943         }
2944         while (fCloseSet->contains(cAt(p11))) {
2945             p11 = moveBack(p11);
2946         }
2947         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2948             break;
2949         }
2950
2951         //  Rule (12)  Any x Any
2952         continue;
2953     }
2954     breakPos = p2;
2955     return breakPos;
2956 }
2957
2958 RBBISentMonkey::~RBBISentMonkey() {
2959     delete fSets;
2960     delete fSepSet;
2961     delete fFormatSet;
2962     delete fSpSet;
2963     delete fLowerSet;
2964     delete fUpperSet;
2965     delete fOLetterSet;
2966     delete fNumericSet;
2967     delete fATermSet;
2968     delete fSContinueSet;
2969     delete fSTermSet;
2970     delete fCloseSet;
2971     delete fOtherSet;
2972     delete fExtendSet;
2973 }
2974
2975
2976
2977 //-------------------------------------------------------------------------------------------
2978 //
2979 //  RBBILineMonkey
2980 //
2981 //-------------------------------------------------------------------------------------------
2982
2983 class RBBILineMonkey: public RBBIMonkeyKind {
2984 public:
2985     RBBILineMonkey();
2986     virtual          ~RBBILineMonkey();
2987     virtual  UVector *charClasses();
2988     virtual  void     setText(const UnicodeString &s);
2989     virtual  int32_t  next(int32_t i);
2990     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2991 private:
2992     UVector      *fSets;
2993
2994     UnicodeSet  *fBK;
2995     UnicodeSet  *fCR;
2996     UnicodeSet  *fLF;
2997     UnicodeSet  *fCM;
2998     UnicodeSet  *fNL;
2999     UnicodeSet  *fSG;
3000     UnicodeSet  *fWJ;
3001     UnicodeSet  *fZW;
3002     UnicodeSet  *fGL;
3003     UnicodeSet  *fCB;
3004     UnicodeSet  *fSP;
3005     UnicodeSet  *fB2;
3006     UnicodeSet  *fBA;
3007     UnicodeSet  *fBB;
3008     UnicodeSet  *fHY;
3009     UnicodeSet  *fH2;
3010     UnicodeSet  *fH3;
3011     UnicodeSet  *fCL;
3012     UnicodeSet  *fCP;
3013     UnicodeSet  *fEX;
3014     UnicodeSet  *fIN;
3015     UnicodeSet  *fJL;
3016     UnicodeSet  *fJV;
3017     UnicodeSet  *fJT;
3018     UnicodeSet  *fNS;
3019     UnicodeSet  *fOP;
3020     UnicodeSet  *fQU;
3021     UnicodeSet  *fIS;
3022     UnicodeSet  *fNU;
3023     UnicodeSet  *fPO;
3024     UnicodeSet  *fPR;
3025     UnicodeSet  *fSY;
3026     UnicodeSet  *fAI;
3027     UnicodeSet  *fAL;
3028     UnicodeSet  *fCJ;
3029     UnicodeSet  *fHL;
3030     UnicodeSet  *fID;
3031     UnicodeSet  *fRI;
3032     UnicodeSet  *fXX;
3033     UnicodeSet  *fEB;
3034     UnicodeSet  *fEM;
3035     UnicodeSet  *fZJ;
3036
3037     BreakIterator        *fCharBI;
3038     const UnicodeString  *fText;
3039     RegexMatcher         *fNumberMatcher;
3040 };
3041
3042 RBBILineMonkey::RBBILineMonkey() :
3043     RBBIMonkeyKind(),
3044     fSets(NULL),
3045
3046     fCharBI(NULL),
3047     fText(NULL),
3048     fNumberMatcher(NULL)
3049
3050 {
3051     if (U_FAILURE(deferredStatus)) {
3052         return;
3053     }
3054
3055     UErrorCode  status = U_ZERO_ERROR;
3056
3057     fSets  = new UVector(status);
3058
3059     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3060     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3061     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3062     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3063     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3064     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3065     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3066     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3067     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3068     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3069     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3070     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3071     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3072     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3073     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3074     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3075     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3076     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3077     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3078     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3079     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3080     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3081     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3082     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3083     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3084     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3085     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3086     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3087     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3088     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3089     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3090     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3091     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3092     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3093     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3094     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3095     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3096     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3097     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3098     fEB    = new UnicodeSet(UnicodeString(
3099                 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C3-\\U0001F3C4\\U0001F3CA-\\U0001F3CB\\U0001F442-\\U0001F443"
3100                 "\\U0001F446-\\U0001F450\\U0001F466-\\U0001F469\\U0001F46E\\U0001F470-\\U0001F478\\U0001F47C\\U0001F481-\\U0001F483"
3101                 "\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F575\\U0001F590\\U0001F595-\\U0001F596\\U0001F645-\\U0001F647"
3102                 "\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F918]"), status);
3103     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
3104     fZJ    = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
3105
3106     if (U_FAILURE(status)) {
3107         deferredStatus = status;
3108         return;
3109     }
3110
3111     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3112     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3113     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3114
3115     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3116
3117     fID->addAll(*fEB);     // Emoji Base and Emoji Modifier behave as ID.
3118     fID->addAll(*fEM);
3119     fAL->removeAll(*fEM);
3120
3121
3122     fAL->remove((UChar32)0x2764);   // Emoji Proposal: move u2764 from Al to Id
3123     fAI->remove((UChar32)0x2640);   // new ZWJ seqs
3124     fAI->remove((UChar32)0x2642);   // new ZWJ seqs
3125     fID->add((UChar32)0x2764);
3126     fID->add((UChar32)0x2640);
3127     fID->add((UChar32)0x2642);
3128
3129     fSets->addElement(fBK, status);
3130     fSets->addElement(fCR, status);
3131     fSets->addElement(fLF, status);
3132     fSets->addElement(fCM, status);
3133     fSets->addElement(fNL, status);
3134     fSets->addElement(fWJ, status);
3135     fSets->addElement(fZW, status);
3136     fSets->addElement(fGL, status);
3137     fSets->addElement(fCB, status);
3138     fSets->addElement(fSP, status);
3139     fSets->addElement(fB2, status);
3140     fSets->addElement(fBA, status);
3141     fSets->addElement(fBB, status);
3142     fSets->addElement(fHY, status);
3143     fSets->addElement(fH2, status);
3144     fSets->addElement(fH3, status);
3145     fSets->addElement(fCL, status);
3146     fSets->addElement(fCP, status);
3147     fSets->addElement(fEX, status);
3148     fSets->addElement(fIN, status);
3149     fSets->addElement(fJL, status);
3150     fSets->addElement(fJT, status);
3151     fSets->addElement(fJV, status);
3152     fSets->addElement(fNS, status);
3153     fSets->addElement(fOP, status);
3154     fSets->addElement(fQU, status);
3155     fSets->addElement(fIS, status);
3156     fSets->addElement(fNU, status);
3157     fSets->addElement(fPO, status);
3158     fSets->addElement(fPR, status);
3159     fSets->addElement(fSY, status);
3160     fSets->addElement(fAI, status);
3161     fSets->addElement(fAL, status);
3162     fSets->addElement(fHL, status);
3163     fSets->addElement(fID, status);
3164     fSets->addElement(fWJ, status);
3165     fSets->addElement(fRI, status);
3166     fSets->addElement(fSG, status);
3167     fSets->addElement(fEB, status);
3168     fSets->addElement(fEM, status);
3169     fSets->addElement(fZJ, status);
3170
3171     const char *rules =
3172             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3173             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3174             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3175             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3176             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3177             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3178
3179     fNumberMatcher = new RegexMatcher(
3180         UnicodeString(rules, -1, US_INV), 0, status);
3181
3182     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3183
3184     if (U_FAILURE(status)) {
3185         deferredStatus = status;
3186     }
3187 }
3188
3189
3190 void RBBILineMonkey::setText(const UnicodeString &s) {
3191     fText       = &s;
3192     fCharBI->setText(s);
3193     fNumberMatcher->reset(s);
3194 }
3195
3196 //
3197 //  rule9Adjust
3198 //     Line Break TR rules 9 and 10 implementation.
3199 //     This deals with combining marks and other sequences that
3200 //     that must be treated as if they were something other than what they actually are.
3201 //
3202 //     This is factored out into a separate function because it must be applied twice for
3203 //     each potential break, once to the chars before the position being checked, then
3204 //     again to the text following the possible break.
3205 //
3206 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3207     if (pos == -1) {
3208         // Invalid initial position.  Happens during the warmup iteration of the
3209         //   main loop in next().
3210         return;
3211     }
3212
3213     int32_t  nPos = *nextPos;
3214
3215     // LB 9  Keep combining sequences together.
3216     //  advance over any CM class chars.  Note that Line Break CM is different
3217     //  from the normal Grapheme Extend property.
3218     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3219           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3220         for (;;) {
3221             *nextChar = fText->char32At(nPos);
3222             if (!fCM->contains(*nextChar)) {
3223                 break;
3224             }
3225             nPos = fText->moveIndex32(nPos, 1);
3226         }
3227     }
3228
3229
3230     // LB 9 Treat X CM* as if it were x.
3231     //       No explicit action required.
3232
3233     // LB 10  Treat any remaining combining mark as AL
3234     if (fCM->contains(*posChar)) {
3235         *posChar = 0x41;   // thisChar = 'A';
3236     }
3237
3238     // Push the updated nextPos and nextChar back to our caller.
3239     // This only makes a difference if posChar got bigger by consuming a
3240     // combining sequence.
3241     *nextPos  = nPos;
3242     *nextChar = fText->char32At(nPos);
3243 }
3244
3245
3246
3247 int32_t RBBILineMonkey::next(int32_t startPos) {
3248     UErrorCode status = U_ZERO_ERROR;
3249     int32_t    pos;       //  Index of the char following a potential break position
3250     UChar32    thisChar;  //  Character at above position "pos"
3251
3252     int32_t    prevPos;   //  Index of the char preceding a potential break position
3253     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3254                           //   and thisChar may not be adjacent because combining
3255                           //   characters between them will be ignored.
3256
3257     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3258     UChar32    prevCharX2;
3259
3260     int32_t    nextPos;   //  Index of the next character following pos.
3261                           //     Usually skips over combining marks.
3262     int32_t    nextCPPos; //  Index of the code point following "pos."
3263                           //     May point to a combining mark.
3264     int32_t    tPos;      //  temp value.
3265     UChar32    c;
3266
3267     if (U_FAILURE(deferredStatus)) {
3268         return -1;
3269     }
3270
3271     if (startPos >= fText->length()) {
3272         return -1;
3273     }
3274
3275
3276     // Initial values for loop.  Loop will run the first time without finding breaks,
3277     //                           while the invalid values shift out and the "this" and
3278     //                           "prev" positions are filled in with good values.
3279     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3280     thisChar = prevChar  = prevCharX2 = 0;
3281     nextPos  = nextCPPos = startPos;
3282
3283
3284     // Loop runs once per position in the test text, until a break position
3285     //  is found.
3286     for (;;) {
3287         prevPosX2 = prevPos;
3288         prevCharX2 = prevChar;
3289
3290         prevPos   = pos;
3291         prevChar  = thisChar;
3292
3293         pos       = nextPos;
3294         thisChar  = fText->char32At(pos);
3295
3296         nextCPPos = fText->moveIndex32(pos, 1);
3297         nextPos   = nextCPPos;
3298
3299         // Rule LB2 - Break at end of text.
3300         if (pos >= fText->length()) {
3301             break;
3302         }
3303
3304         // Rule LB 9 - adjust for combining sequences.
3305         //             We do this one out-of-order because the adjustment does not change anything
3306         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3307         //             be applied.
3308         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3309         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3310         c = fText->char32At(nextPos);
3311         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3312
3313         // If the loop is still warming up - if we haven't shifted the initial
3314         //   -1 positions out of prevPos yet - loop back to advance the
3315         //    position in the input without any further looking for breaks.
3316         if (prevPos == -1) {
3317             continue;
3318         }
3319
3320         // LB 4  Always break after hard line breaks,
3321         if (fBK->contains(prevChar)) {
3322             break;
3323         }
3324
3325         // LB 5  Break after CR, LF, NL, but not inside CR LF
3326         if (prevChar == 0x0d && thisChar == 0x0a) {
3327             continue;
3328         }
3329         if (prevChar == 0x0d ||
3330             prevChar == 0x0a ||
3331             prevChar == 0x85)  {
3332             break;
3333         }
3334
3335         // LB 6  Don't break before hard line breaks
3336         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3337             fBK->contains(thisChar)) {
3338                 continue;
3339         }
3340
3341
3342         // LB 7  Don't break before spaces or zero-width space.
3343         if (fSP->contains(thisChar)) {
3344             continue;
3345         }
3346
3347         if (fZW->contains(thisChar)) {
3348             continue;
3349         }
3350
3351         // LB 8  Break after zero width space
3352         if (fZW->contains(prevChar)) {
3353             break;
3354         }
3355
3356         // LB 8a ZJ x ID
3357         //       The monkey test's way of ignoring combining characters doesn't work
3358         //       for this rule. ZJ is also a CM. Need to get the actual character
3359         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3360         {
3361             int32_t prevIdx = fText->moveIndex32(pos, -1);
3362             UChar32 prevC = fText->char32At(prevIdx);
3363             if (fZJ->contains(prevC) && fID->contains(thisChar)) {
3364                 continue;
3365             }
3366         }
3367
3368         // LB 9, 10  Already done, at top of loop.
3369         //
3370
3371
3372         // LB 11  Do not break before or after WORD JOINER and related characters.
3373         //    x  WJ
3374         //    WJ  x
3375         //
3376         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3377             continue;
3378         }
3379
3380         // LB 12
3381         //    GL  x
3382         if (fGL->contains(prevChar)) {
3383             continue;
3384         }
3385
3386         // LB 12a
3387         //    [^SP BA HY] x GL
3388         if (!(fSP->contains(prevChar) ||
3389               fBA->contains(prevChar) ||
3390               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3391             continue;
3392         }
3393
3394
3395
3396         // LB 13  Don't break before closings.
3397         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3398         //        fall into LB 17 and the more general number regular expression.
3399         //
3400         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3401             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3402                                          fEX->contains(thisChar)  ||
3403             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3404             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3405             continue;
3406         }
3407
3408         // LB 14 Don't break after OP SP*
3409         //       Scan backwards, checking for this sequence.
3410         //       The OP char could include combining marks, so we actually check for
3411         //           OP CM* SP*
3412         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3413         //       sequence into a ID char, so before scanning back through spaces,
3414         //       verify that prevChar is indeed a space.  The prevChar variable
3415         //       may differ from fText[prevPos]
3416         tPos = prevPos;
3417         if (fSP->contains(prevChar)) {
3418             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3419                 tPos=fText->moveIndex32(tPos, -1);
3420             }
3421         }
3422         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3423             tPos=fText->moveIndex32(tPos, -1);
3424         }
3425         if (fOP->contains(fText->char32At(tPos))) {
3426             continue;
3427         }
3428
3429
3430         // LB 15    QU SP* x OP
3431         if (fOP->contains(thisChar)) {
3432             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3433             int tPos = prevPos;
3434             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3435                 tPos = fText->moveIndex32(tPos, -1);
3436             }
3437             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3438                 tPos = fText->moveIndex32(tPos, -1);
3439             }
3440             if (fQU->contains(fText->char32At(tPos))) {
3441                 continue;
3442             }
3443         }
3444
3445
3446
3447         // LB 16   (CL | CP) SP* x NS
3448         //    Scan backwards for SP* CM* (CL | CP)
3449         if (fNS->contains(thisChar)) {
3450             int tPos = prevPos;
3451             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3452                 tPos = fText->moveIndex32(tPos, -1);
3453             }
3454             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3455                 tPos = fText->moveIndex32(tPos, -1);
3456             }
3457             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3458                 continue;
3459             }
3460         }
3461
3462
3463         // LB 17        B2 SP* x B2
3464         if (fB2->contains(thisChar)) {
3465             //  Scan backwards, checking for the B2 CM* SP* sequence.
3466             tPos = prevPos;
3467             if (fSP->contains(prevChar)) {
3468                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3469                     tPos=fText->moveIndex32(tPos, -1);
3470                 }
3471             }
3472             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3473                 tPos=fText->moveIndex32(tPos, -1);
3474             }
3475             if (fB2->contains(fText->char32At(tPos))) {
3476                 continue;
3477             }
3478         }
3479
3480
3481         // LB 18    break after space
3482         if (fSP->contains(prevChar)) {
3483             break;
3484         }
3485
3486         // LB 19
3487         //    x   QU
3488         //    QU  x
3489         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3490             continue;
3491         }
3492
3493         // LB 20  Break around a CB
3494         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3495             break;
3496         }
3497
3498         // LB 21
3499         if (fBA->contains(thisChar) ||
3500             fHY->contains(thisChar) ||
3501             fNS->contains(thisChar) ||
3502             fBB->contains(prevChar) )   {
3503             continue;
3504         }
3505
3506         // LB 21a
3507         //   HL (HY | BA) x
3508         if (fHL->contains(prevCharX2) &&
3509                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3510             continue;
3511         }
3512
3513         // LB 21b
3514         //   SY x HL
3515         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3516             continue;
3517         }
3518
3519         // LB 22
3520         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3521             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3522             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3523             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3524             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3525             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3526             continue;
3527         }
3528
3529
3530         // LB 23    ID x PO
3531         //          AL x NU
3532         //          HL x NU
3533         //          NU x AL
3534         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3535             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3536             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3537             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3538             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3539             continue;
3540         }
3541
3542         // LB 24  Do not break between prefix and letters or ideographs.
3543         //        PR x ID
3544         //        PR x (AL | HL)
3545         //        PO x (AL | HL)
3546         //        (AL | HL) x PR        // Apple early addition
3547         //        (AL | HL) x PO        // Apple early addition
3548         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3549             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3550             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3551             ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPR->contains(thisChar)) ||
3552             ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPO->contains(thisChar)) ) {
3553             continue;
3554         }
3555
3556
3557
3558         // LB 25    Numbers
3559         if (fNumberMatcher->lookingAt(prevPos, status)) {
3560             if (U_FAILURE(status)) {
3561                 break;
3562             }
3563             // Matched a number.  But could have been just a single digit, which would
3564             //    not represent a "no break here" between prevChar and thisChar
3565             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3566             if (numEndIdx > pos) {
3567                 // Number match includes at least our two chars being checked
3568                 if (numEndIdx > nextPos) {
3569                     // Number match includes additional chars.  Update pos and nextPos
3570                     //   so that next loop iteration will continue at the end of the number,
3571                     //   checking for breaks between last char in number & whatever follows.
3572                     pos = nextPos = numEndIdx;
3573                     do {
3574                         pos = fText->moveIndex32(pos, -1);
3575                         thisChar = fText->char32At(pos);
3576                     } while (fCM->contains(thisChar));
3577                 }
3578                 continue;
3579             }
3580         }
3581
3582
3583         // LB 26 Do not break a Korean syllable.
3584         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3585                                         fJV->contains(thisChar) ||
3586                                         fH2->contains(thisChar) ||
3587                                         fH3->contains(thisChar))) {
3588                                             continue;
3589                                         }
3590
3591         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3592             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3593                 continue;
3594         }
3595
3596         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3597             fJT->contains(thisChar)) {
3598                 continue;
3599         }
3600
3601         // LB 27 Treat a Korean Syllable Block the same as ID.
3602         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3603             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3604             fIN->contains(thisChar)) {
3605                 continue;
3606             }
3607         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3608             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3609             fPO->contains(thisChar)) {
3610                 continue;
3611             }
3612         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3613             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3614                 continue;
3615             }
3616
3617
3618
3619         // LB 28  Do not break between alphabetics ("at").
3620         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3621             continue;
3622         }
3623
3624         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3625         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3626             continue;
3627         }
3628
3629         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3630         //          (AL | NU) x OP
3631         //          CP x (AL | NU)
3632         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3633             continue;
3634         }
3635         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3636             continue;
3637         }
3638
3639         // LB30a    RI RI <break> RI
3640         //             RI    x    RI
3641         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3642             break;
3643         }
3644         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3645             continue;
3646         }
3647
3648         // LB30b    Emoji Base x Emoji Modifier
3649         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3650             continue;
3651         }
3652
3653         // LB 31    Break everywhere else
3654         break;
3655
3656     }
3657
3658     return pos;
3659 }
3660
3661
3662 UVector  *RBBILineMonkey::charClasses() {
3663     return fSets;
3664 }
3665
3666
3667 RBBILineMonkey::~RBBILineMonkey() {
3668     delete fSets;
3669
3670     delete fBK;
3671     delete fCR;
3672     delete fLF;
3673     delete fCM;
3674     delete fNL;
3675     delete fWJ;
3676     delete fZW;
3677     delete fGL;
3678     delete fCB;
3679     delete fSP;
3680     delete fB2;
3681     delete fBA;
3682     delete fBB;
3683     delete fHY;
3684     delete fH2;
3685     delete fH3;
3686     delete fCL;
3687     delete fCP;
3688     delete fEX;
3689     delete fIN;
3690     delete fJL;
3691     delete fJV;
3692     delete fJT;
3693     delete fNS;
3694     delete fOP;
3695     delete fQU;
3696     delete fIS;
3697     delete fNU;
3698     delete fPO;
3699     delete fPR;
3700     delete fSY;
3701     delete fAI;
3702     delete fAL;
3703     delete fCJ;
3704     delete fHL;
3705     delete fID;
3706     delete fRI;
3707     delete fSG;
3708     delete fXX;
3709     delete fEB;
3710     delete fEM;
3711     delete fZJ;
3712
3713     delete fCharBI;
3714     delete fNumberMatcher;
3715 }
3716
3717
3718 //-------------------------------------------------------------------------------------------
3719 //
3720 //   TestMonkey
3721 //
3722 //     params
3723 //       seed=nnnnn        Random number starting seed.
3724 //                         Setting the seed allows errors to be reproduced.
3725 //       loop=nnn          Looping count.  Controls running time.
3726 //                         -1:  run forever.
3727 //                          0 or greater:  run length.
3728 //
3729 //       type = char | word | line | sent | title
3730 //
3731 //  Example:
3732 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3733 //
3734 //-------------------------------------------------------------------------------------------
3735
3736 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3737     int32_t val = defaultVal;
3738     name.append(" *= *(-?\\d+)");
3739     UErrorCode status = U_ZERO_ERROR;
3740     RegexMatcher m(name, params, 0, status);
3741     if (m.find()) {
3742         // The param exists.  Convert the string to an int.
3743         char valString[100];
3744         int32_t paramLength = m.end(1, status) - m.start(1, status);
3745         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3746             paramLength = (int32_t)(sizeof(valString)-2);
3747         }
3748         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3749         val = strtol(valString,  NULL, 10);
3750
3751         // Delete this parameter from the params string.
3752         m.reset();
3753         params = m.replaceFirst("", status);
3754     }
3755     U_ASSERT(U_SUCCESS(status));
3756     return val;
3757 }
3758 #endif
3759
3760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3761 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3762                                     BreakIterator *bi,
3763                                     int expected[],
3764                                     int expectedcount)
3765 {
3766     int count = 0;
3767     int i = 0;
3768     int forward[50];
3769     bi->setText(ustr);
3770     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3771         forward[count] = i;
3772         if (count < expectedcount && expected[count] != i) {
3773             test->errln("break forward test failed: expected %d but got %d",
3774                         expected[count], i);
3775             break;
3776         }
3777         count ++;
3778     }
3779     if (count != expectedcount) {
3780         printStringBreaks(ustr, expected, expectedcount);
3781         test->errln("break forward test failed: missed %d match",
3782                     expectedcount - count);
3783         return;
3784     }
3785     // testing boundaries
3786     for (i = 1; i < expectedcount; i ++) {
3787         int j = expected[i - 1];
3788         if (!bi->isBoundary(j)) {
3789             printStringBreaks(ustr, expected, expectedcount);
3790             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3791             return;
3792         }
3793         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3794             if (bi->isBoundary(j)) {
3795                 printStringBreaks(ustr, expected, expectedcount);
3796                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3797                 return;
3798             }
3799         }
3800     }
3801
3802     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3803         count --;
3804         if (forward[count] != i) {
3805             printStringBreaks(ustr, expected, expectedcount);
3806             test->errln("happy break test previous() failed: expected %d but got %d",
3807                         forward[count], i);
3808             break;
3809         }
3810     }
3811     if (count != 0) {
3812         printStringBreaks(ustr, expected, expectedcount);
3813         test->errln("break test previous() failed: missed a match");
3814         return;
3815     }
3816
3817     // testing preceding
3818     for (i = 0; i < expectedcount - 1; i ++) {
3819         // int j = expected[i] + 1;
3820         int j = ustr.moveIndex32(expected[i], 1);
3821         for (; j <= expected[i + 1]; j ++) {
3822             if (bi->preceding(j) != expected[i]) {
3823                 printStringBreaks(ustr, expected, expectedcount);
3824                 test->errln("preceding(): Not expecting boundary at position %d", j);
3825                 return;
3826             }
3827         }
3828     }
3829 }
3830 #endif
3831
3832 void RBBITest::TestWordBreaks(void)
3833 {
3834 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3835
3836     Locale        locale("en");
3837     UErrorCode    status = U_ZERO_ERROR;
3838     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3839     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3840     // Replaced any C+J characters in a row with a random sequence of characters
3841     // of the same length to make our C+J segmentation not get in the way.
3842     static const char *strlist[] =
3843     {
3844     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3845     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3846     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3847     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3848     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3849     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3850     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3851     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3852     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3853     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3854     "\\u2027\\U000e0067\\u0a47\\u00b7",
3855     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3856     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3857     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3858     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3859     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3860     "\\u0027\\u11af\\U000e0057\\u0602",
3861     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3862     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3863     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3864     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3865     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3866     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3867     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3868     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3869     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3870     "\\u18f4\\U000e0049\\u20e7\\u2027",
3871     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3872     "\\ua183\\u102d\\u0bec\\u003a",
3873     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3874     "\\u003a\\u0e57\\u0fad\\u002e",
3875     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3876     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3877     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3878     "\\u003a\\u0664\\u00b7\\u1fba",
3879     "\\u003b\\u0027\\u00b7\\u47a3",
3880     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3881     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3882     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3883     };
3884     int loop;
3885     if (U_FAILURE(status)) {
3886         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3887         return;
3888     }
3889     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3890         // printf("looping %d\n", loop);
3891         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3892         // RBBICharMonkey monkey;
3893         RBBIWordMonkey monkey;
3894
3895         int expected[50];
3896         int expectedcount = 0;
3897
3898         monkey.setText(ustr);
3899         int i;
3900         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3901             expected[expectedcount ++] = i;
3902         }
3903
3904         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3905     }
3906     delete bi;
3907 #endif
3908 }
3909
3910 void RBBITest::TestWordBoundary(void)
3911 {
3912     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3913     Locale        locale("en");
3914     UErrorCode    status = U_ZERO_ERROR;
3915     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3916     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3917     UChar         str[50];
3918     static const char *strlist[] =
3919     {
3920     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3921     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3922     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3923     "\\u2027\\U000e0067\\u0a47\\u00b7",
3924     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3925     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3926     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3927     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3928     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3929     "\\u0027\\u11af\\U000e0057\\u0602",
3930     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3931     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3932     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3933     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3934     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3935     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3936     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3937     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3938     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3939     "\\u58f4\\U000e0049\\u20e7\\u2027",
3940     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3941     "\\ua183\\u102d\\u0bec\\u003a",
3942     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3943     "\\u003a\\u0e57\\u0fad\\u002e",
3944     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3945     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3946     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3947     "\\u003a\\u0664\\u00b7\\u1fba",
3948     "\\u003b\\u0027\\u00b7\\u47a3",
3949     };
3950     int loop;
3951     if (U_FAILURE(status)) {
3952         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3953         return;
3954     }
3955     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3956         // printf("looping %d\n", loop);
3957         u_unescape(strlist[loop], str, 20);
3958         UnicodeString ustr(str);
3959         int forward[50];
3960         int count = 0;
3961
3962         bi->setText(ustr);
3963         int prev = 0;
3964         int i;
3965         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3966             forward[count ++] = i;
3967             if (i > prev) {
3968                 int j;
3969                 for (j = prev + 1; j < i; j ++) {
3970                     if (bi->isBoundary(j)) {
3971                         printStringBreaks(ustr, forward, count);
3972                         errln("happy boundary test failed: expected %d not a boundary",
3973                                j);
3974                         return;
3975                     }
3976                 }
3977             }
3978             if (!bi->isBoundary(i)) {
3979                 printStringBreaks(ustr, forward, count);
3980                 errln("happy boundary test failed: expected %d a boundary",
3981                        i);
3982                 return;
3983             }
3984             prev = i;
3985         }
3986     }
3987     delete bi;
3988 }
3989
3990 void RBBITest::TestLineBreaks(void)
3991 {
3992 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3993     Locale        locale("en");
3994     UErrorCode    status = U_ZERO_ERROR;
3995     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3996     const int32_t  STRSIZE = 50;
3997     UChar         str[STRSIZE];
3998     static const char *strlist[] =
3999     {
4000      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4001      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4002              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4003      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4004              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4005      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4006      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4007      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4008      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4009      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4010      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4011      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4012      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4013      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4014      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4015      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4016      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4017      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4018      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4019      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4020      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4021      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4022      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4023      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4024      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4025      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4026      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4027      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4028      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4029      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4030      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4031      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4032      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4033      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4034      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4035      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4036      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4037      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4038      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4039          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4040     };
4041     int loop;
4042     TEST_ASSERT_SUCCESS(status);
4043     if (U_FAILURE(status)) {
4044         return;
4045     }
4046     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4047         // printf("looping %d\n", loop);
4048         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4049         if (t >= STRSIZE) {
4050             TEST_ASSERT(FALSE);
4051             continue;
4052         }
4053
4054
4055         UnicodeString ustr(str);
4056         RBBILineMonkey monkey;
4057         if (U_FAILURE(monkey.deferredStatus)) {
4058             continue;
4059         }
4060
4061         const int EXPECTEDSIZE = 50;
4062         int expected[EXPECTEDSIZE];
4063         int expectedcount = 0;
4064
4065         monkey.setText(ustr);
4066         int i;
4067         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4068             if (expectedcount >= EXPECTEDSIZE) {
4069                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4070                 return;
4071             }
4072             expected[expectedcount ++] = i;
4073         }
4074
4075         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4076     }
4077     delete bi;
4078 #endif
4079 }
4080
4081 void RBBITest::TestSentBreaks(void)
4082 {
4083 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4084     Locale        locale("en");
4085     UErrorCode    status = U_ZERO_ERROR;
4086     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4087     UChar         str[200];
4088     static const char *strlist[] =
4089     {
4090      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4091      "This\n",
4092      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4093      "\"Sentence ending with a quote.\" Bye.",
4094      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4095      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4096      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4097      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4098      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4099      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4100      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4101              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4102              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4103              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4104      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4105              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4106              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4107              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4108              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4109              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4110     };
4111     int loop;
4112     if (U_FAILURE(status)) {
4113         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4114         return;
4115     }
4116     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4117         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4118         UnicodeString ustr(str);
4119
4120         RBBISentMonkey monkey;
4121         if (U_FAILURE(monkey.deferredStatus)) {
4122             continue;
4123         }
4124
4125         const int EXPECTEDSIZE = 50;
4126         int expected[EXPECTEDSIZE];
4127         int expectedcount = 0;
4128
4129         monkey.setText(ustr);
4130         int i;
4131         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4132             if (expectedcount >= EXPECTEDSIZE) {
4133                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4134                 return;
4135             }
4136             expected[expectedcount ++] = i;
4137         }
4138
4139         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4140     }
4141     delete bi;
4142 #endif
4143 }
4144
4145 void RBBITest::TestMonkey(char *params) {
4146 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4147
4148     UErrorCode     status    = U_ZERO_ERROR;
4149     int32_t        loopCount = 500;
4150     int32_t        seed      = 1;
4151     UnicodeString  breakType = "all";
4152     Locale         locale("en");
4153     UBool          useUText  = FALSE;
4154
4155     if (quick == FALSE) {
4156         loopCount = 10000;
4157     }
4158
4159     if (params) {
4160         UnicodeString p(params);
4161         loopCount = getIntParam("loop", p, loopCount);
4162         seed      = getIntParam("seed", p, seed);
4163
4164         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4165         if (m.find()) {
4166             breakType = m.group(1, status);
4167             m.reset();
4168             p = m.replaceFirst("", status);
4169         }
4170
4171         RegexMatcher u(" *utext", p, 0, status);
4172         if (u.find()) {
4173             useUText = TRUE;
4174             u.reset();
4175             p = u.replaceFirst("", status);
4176         }
4177
4178
4179         // m.reset(p);
4180         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4181             // Each option is stripped out of the option string as it is processed.
4182             // All options have been checked.  The option string should have been completely emptied..
4183             char buf[100];
4184             p.extract(buf, sizeof(buf), NULL, status);
4185             buf[sizeof(buf)-1] = 0;
4186             errln("Unrecognized or extra parameter:  %s\n", buf);
4187             return;
4188         }
4189
4190     }
4191
4192     if (breakType == "char" || breakType == "all") {
4193         RBBICharMonkey  m;
4194         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4195         if (U_SUCCESS(status)) {
4196             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4197             if (breakType == "all" && useUText==FALSE) {
4198                 // Also run a quick test with UText when "all" is specified
4199                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4200             }
4201         }
4202         else {
4203             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4204         }
4205         delete bi;
4206     }
4207
4208     if (breakType == "word" || breakType == "all") {
4209         logln("Word Break Monkey Test");
4210         RBBIWordMonkey  m;
4211         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4212         if (U_SUCCESS(status)) {
4213             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4214         }
4215         else {
4216             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4217         }
4218         delete bi;
4219     }
4220
4221     if (breakType == "line" || breakType == "all") {
4222         logln("Line Break Monkey Test");
4223         RBBILineMonkey  m;
4224         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4225         if (loopCount >= 10) {
4226             loopCount = loopCount / 5;   // Line break runs slower than the others.
4227         }
4228         if (U_SUCCESS(status)) {
4229             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4230         }
4231         else {
4232             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4233         }
4234         delete bi;
4235     }
4236
4237     if (breakType == "sent" || breakType == "all"  ) {
4238         logln("Sentence Break Monkey Test");
4239         RBBISentMonkey  m;
4240         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4241         if (loopCount >= 10) {
4242             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4243         }
4244         if (U_SUCCESS(status)) {
4245             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4246         }
4247         else {
4248             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4249         }
4250         delete bi;
4251     }
4252
4253 #endif
4254 }
4255
4256 //
4257 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4258 //    Parameters:
4259 //       bi      - the break iterator to use
4260 //       mk      - MonkeyKind, abstraction for obtaining expected results
4261 //       name    - Name of test (char, word, etc.) for use in error messages
4262 //       seed    - Seed for starting random number generator (parameter from user)
4263 //       numIterations
4264 //
4265 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4266                          int32_t numIterations, UBool useUText) {
4267
4268 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4269
4270     const int32_t    TESTSTRINGLEN = 500;
4271     UnicodeString    testText;
4272     int32_t          numCharClasses;
4273     UVector          *chClasses;
4274     int              expected[TESTSTRINGLEN*2 + 1];
4275     int              expectedCount = 0;
4276     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4277     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4278     char             reverseBreaks[TESTSTRINGLEN*2+1];
4279     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4280     char             followingBreaks[TESTSTRINGLEN*2+1];
4281     char             precedingBreaks[TESTSTRINGLEN*2+1];
4282     int              i;
4283     int              loopCount = 0;
4284
4285     m_seed = seed;
4286
4287     numCharClasses = mk.charClasses()->size();
4288     chClasses      = mk.charClasses();
4289
4290     // Check for errors that occured during the construction of the MonkeyKind object.
4291     //  Can't report them where they occured because errln() is a method coming from intlTest,
4292     //  and is not visible outside of RBBITest :-(
4293     if (U_FAILURE(mk.deferredStatus)) {
4294         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4295         return;
4296     }
4297
4298     // Verify that the character classes all have at least one member.
4299     for (i=0; i<numCharClasses; i++) {
4300         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4301         if (s == NULL || s->size() == 0) {
4302             errln("Character Class #%d is null or of zero size.", i);
4303             return;
4304         }
4305     }
4306
4307     while (loopCount < numIterations || numIterations == -1) {
4308         if (numIterations == -1 && loopCount % 10 == 0) {
4309             // If test is running in an infinite loop, display a periodic tic so
4310             //   we can tell that it is making progress.
4311             fprintf(stderr, ".");
4312         }
4313         // Save current random number seed, so that we can recreate the random numbers
4314         //   for this loop iteration in event of an error.
4315         seed = m_seed;
4316
4317         // Populate a test string with data.
4318         testText.truncate(0);
4319         for (i=0; i<TESTSTRINGLEN; i++) {
4320             int32_t  aClassNum = m_rand() % numCharClasses;
4321             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4322             int32_t   charIdx = m_rand() % classSet->size();
4323             UChar32   c = classSet->charAt(charIdx);
4324             if (c < 0) {   // TODO:  deal with sets containing strings.
4325                 errln("%s:%d c < 0", __FILE__, __LINE__);
4326                 break;
4327             }
4328             // Do not assemble a supplementary character from randomly generated separate surrogates.
4329             //   (It could be a dictionary character)
4330             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4331                 continue;
4332             }
4333
4334             testText.append(c);
4335         }
4336
4337         // Calculate the expected results for this test string.
4338         mk.setText(testText);
4339         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4340         expectedBreaks[0] = 1;
4341         int32_t breakPos = 0;
4342         expectedCount = 0;
4343         for (;;) {
4344             breakPos = mk.next(breakPos);
4345             if (breakPos == -1) {
4346                 break;
4347             }
4348             if (breakPos > testText.length()) {
4349                 errln("breakPos > testText.length()");
4350             }
4351             expectedBreaks[breakPos] = 1;
4352             U_ASSERT(expectedCount<testText.length());
4353             expected[expectedCount ++] = breakPos;
4354             (void)expected;   // Set but not used warning.
4355                               // TODO (andy): check it out.
4356         }
4357
4358         // Find the break positions using forward iteration
4359         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4360         if (useUText) {
4361             UErrorCode status = U_ZERO_ERROR;
4362             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4363             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4364             bi->setText(testUText, status);
4365             TEST_ASSERT_SUCCESS(status);
4366             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4367                                       //  This UText can be closed immediately, so long as the
4368                                       //  testText string continues to exist.
4369         } else {
4370             bi->setText(testText);
4371         }
4372
4373         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4374             if (i < 0 || i > testText.length()) {
4375                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4376                 break;
4377             }
4378             forwardBreaks[i] = 1;
4379         }
4380
4381         // Find the break positions using reverse iteration
4382         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4383         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4384             if (i < 0 || i > testText.length()) {
4385                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4386                 break;
4387             }
4388             reverseBreaks[i] = 1;
4389         }
4390
4391         // Find the break positions using isBoundary() tests.
4392         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4393         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4394         for (i=0; i<=testText.length(); i++) {
4395             isBoundaryBreaks[i] = bi->isBoundary(i);
4396         }
4397
4398
4399         // Find the break positions using the following() function.
4400         // printf(".");
4401         memset(followingBreaks, 0, sizeof(followingBreaks));
4402         int32_t   lastBreakPos = 0;
4403         followingBreaks[0] = 1;
4404         for (i=0; i<testText.length(); i++) {
4405             breakPos = bi->following(i);
4406             if (breakPos <= i ||
4407                 breakPos < lastBreakPos ||
4408                 breakPos > testText.length() ||
4409                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4410                 errln("%s break monkey test: "
4411                     "Out of range value returned by BreakIterator::following().\n"
4412                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4413                          name, seed, i, breakPos, lastBreakPos);
4414                 break;
4415             }
4416             followingBreaks[breakPos] = 1;
4417             lastBreakPos = breakPos;
4418         }
4419
4420         // Find the break positions using the preceding() function.
4421         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4422         lastBreakPos = testText.length();
4423         precedingBreaks[testText.length()] = 1;
4424         for (i=testText.length(); i>0; i--) {
4425             breakPos = bi->preceding(i);
4426             if (breakPos >= i ||
4427                 breakPos > lastBreakPos ||
4428                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4429                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4430                 errln("%s break monkey test: "
4431                     "Out of range value returned by BreakIterator::preceding().\n"
4432                     "index=%d;  prev returned %d; lastBreak=%d" ,
4433                     name,  i, breakPos, lastBreakPos);
4434                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4435                     precedingBreaks[i] = 2;   // Forces an error.
4436                 }
4437             } else {
4438                 if (breakPos >= 0) {
4439                     precedingBreaks[breakPos] = 1;
4440                 }
4441                 lastBreakPos = breakPos;
4442             }
4443         }
4444
4445         // Compare the expected and actual results.
4446         for (i=0; i<=testText.length(); i++) {
4447             const char *errorType = NULL;
4448             if  (forwardBreaks[i] != expectedBreaks[i]) {
4449                 errorType = "next()";
4450             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4451                 errorType = "previous()";
4452             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4453                 errorType = "isBoundary()";
4454             } else if (followingBreaks[i] != expectedBreaks[i]) {
4455                 errorType = "following()";
4456             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4457                 errorType = "preceding()";
4458             }
4459
4460
4461             if (errorType != NULL) {
4462                 // Format a range of the test text that includes the failure as
4463                 //  a data item that can be included in the rbbi test data file.
4464
4465                 // Start of the range is the last point where expected and actual results
4466                 //   both agreed that there was a break position.
4467                 int startContext = i;
4468                 int32_t count = 0;
4469                 for (;;) {
4470                     if (startContext==0) { break; }
4471                     startContext --;
4472                     if (expectedBreaks[startContext] != 0) {
4473                         if (count == 2) break;
4474                         count ++;
4475                     }
4476                 }
4477
4478                 // End of range is two expected breaks past the start position.
4479                 int endContext = i + 1;
4480                 int ci;
4481                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4482                     for (;;) {
4483                         if (endContext >= testText.length()) {break;}
4484                         if (expectedBreaks[endContext-1] != 0) {
4485                             if (count == 0) break;
4486                             count --;
4487                         }
4488                         endContext ++;
4489                     }
4490                 }
4491
4492                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4493                 UnicodeString errorText = "<data>";
4494                 /***if (strcmp(errorType, "next()") == 0) {
4495                     startContext = 0;
4496                     endContext = testText.length();
4497
4498                     printStringBreaks(testText, expected, expectedCount);
4499                 }***/
4500
4501                 for (ci=startContext; ci<endContext;) {
4502                     UnicodeString hexChars("0123456789abcdef");
4503                     UChar32  c;
4504                     int      bn;
4505                     c = testText.char32At(ci);
4506                     if (ci == i) {
4507                         // This is the location of the error.
4508                         errorText.append("<?>");
4509                     } else if (expectedBreaks[ci] != 0) {
4510                         // This a non-error expected break position.
4511                         errorText.append("\\");
4512                     }
4513                     if (c < 0x10000) {
4514                         errorText.append("\\u");
4515                         for (bn=12; bn>=0; bn-=4) {
4516                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4517                         }
4518                     } else {
4519                         errorText.append("\\U");
4520                         for (bn=28; bn>=0; bn-=4) {
4521                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4522                         }
4523                     }
4524                     ci = testText.moveIndex32(ci, 1);
4525                 }
4526                 errorText.append("\\");
4527                 errorText.append("</data>\n");
4528
4529                 // Output the error
4530                 char  charErrorTxt[500];
4531                 UErrorCode status = U_ZERO_ERROR;
4532                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4533                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4534                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4535
4536                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4537                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4538                     errorType, seed, i, charErrorTxt);
4539                 break;
4540             }
4541         }
4542
4543         loopCount++;
4544     }
4545 #endif
4546 }
4547
4548
4549 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4550 //             This test checks the initial patch,
4551 //             which is to just keep it from crashing.  Correct word boundaries
4552 //             await a proper fix to the dictionary code.
4553 //
4554 void RBBITest::TestBug5532(void)  {
4555    // Text includes a mixture of Thai and Latin.
4556    const unsigned char utf8Data[] = {
4557            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4558            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4559            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4560            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4561            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4562            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4563            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4564            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4565            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4566            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4567            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4568
4569     UErrorCode status = U_ZERO_ERROR;
4570     UText utext=UTEXT_INITIALIZER;
4571     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4572     TEST_ASSERT_SUCCESS(status);
4573
4574     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4575     TEST_ASSERT_SUCCESS(status);
4576     if (U_SUCCESS(status)) {
4577         bi->setText(&utext, status);
4578         TEST_ASSERT_SUCCESS(status);
4579
4580         int32_t breakCount = 0;
4581         int32_t previousBreak = -1;
4582         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4583             // For now, just make sure that the break iterator doesn't hang.
4584             TEST_ASSERT(previousBreak < bi->current());
4585             previousBreak = bi->current();
4586         }
4587         TEST_ASSERT(breakCount > 0);
4588     }
4589     delete bi;
4590     utext_close(&utext);
4591 }
4592
4593
4594 void RBBITest::TestBug9983(void)  {
4595     UnicodeString text = UnicodeString("\\u002A"  // * Other
4596                                        "\\uFF65"  //   Other
4597                                        "\\u309C"  //   Katakana
4598                                        "\\uFF9F"  //   Extend
4599                                        "\\uFF65"  //   Other
4600                                        "\\u0020"  //   Other
4601                                        "\\u0000").unescape();
4602
4603     UErrorCode status = U_ZERO_ERROR;
4604     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4605         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4606     TEST_ASSERT_SUCCESS(status);
4607     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4608         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4609     TEST_ASSERT_SUCCESS(status);
4610     if (U_FAILURE(status)) {
4611         return;
4612     }
4613     int32_t offset, rstatus, iterationCount;
4614
4615     brkiter->setText(text);
4616     brkiter->last();
4617     iterationCount = 0;
4618     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4619         iterationCount++;
4620         rstatus = brkiter->getRuleStatus();
4621         (void)rstatus;     // Suppress set but not used warning.
4622         if (iterationCount >= 10) {
4623            break;
4624         }
4625     }
4626     TEST_ASSERT(iterationCount == 6);
4627
4628     brkiterPOSIX->setText(text);
4629     brkiterPOSIX->last();
4630     iterationCount = 0;
4631     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4632         iterationCount++;
4633         rstatus = brkiterPOSIX->getRuleStatus();
4634         (void)rstatus;     // Suppress set but not used warning.
4635         if (iterationCount >= 10) {
4636            break;
4637         }
4638     }
4639     TEST_ASSERT(iterationCount == 6);
4640 }
4641
4642
4643 //
4644 //  TestDebug    -  A place-holder test for debugging purposes.
4645 //                  For putting in fragments of other tests that can be invoked
4646 //                  for tracing  without a lot of unwanted extra stuff happening.
4647 //
4648 void RBBITest::TestDebug(void) {
4649 #if 0
4650     UErrorCode   status = U_ZERO_ERROR;
4651     int pos = 0;
4652     int ruleStatus = 0;
4653
4654     RuleBasedBreakIterator* bi =
4655        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4656        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4657        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4658     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4659     // UnicodeString s("Aaa.  Bcd");
4660     s = s.unescape();
4661     bi->setText(s);
4662     UBool r = bi->isBoundary(8);
4663     printf("%s", r?"true":"false");
4664     return;
4665     pos = bi->last();
4666     do {
4667         // ruleStatus = bi->getRuleStatus();
4668         printf("%d\t%d\n", pos, ruleStatus);
4669         pos = bi->previous();
4670     } while (pos != BreakIterator::DONE);
4671 #endif
4672 }
4673
4674 void RBBITest::TestProperties() {
4675     UErrorCode errorCode = U_ZERO_ERROR;
4676     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4677     if (!prependSet.isEmpty()) {
4678         errln(
4679             "[:GCB=Prepend:] is not empty any more. "
4680             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4681             "change this test to the opposite condition.");
4682     }
4683 }
4684
4685 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */