icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2016, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "unicode/utypes.h"
  13 #if !UCONFIG_NO_BREAK_ITERATION
  14
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17 #include <string.h>
  18
  19 #include "unicode/brkiter.h"
  20 #include "unicode/localpointer.h"
  21 #include "unicode/numfmt.h"
  22 #include "unicode/rbbi.h"
  23 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  24 #include "unicode/regex.h"
  25 #endif
  26 #include "unicode/schriter.h"
  27 #include "unicode/uchar.h"
  28 #include "unicode/utf16.h"
  29 #include "unicode/ucnv.h"
  30 #include "unicode/uniset.h"
  31 #include "unicode/uscript.h"
  32 #include "unicode/ustring.h"
  33 #include "unicode/utext.h"
  34
  35 #include "charstr.h"
  36 #include "cmemory.h"
  37 #include "intltest.h"
  38 #include "rbbitst.h"
  39 #include "utypeinfo.h"  // for 'typeid' to work
  40 #include "uvector.h"
  41 #include "uvectr32.h"
  42
  43 #if !UCONFIG_NO_FILTERED_BREAK_ITERATION
  44 #include "unicode/filteredbrk.h"
  45 #endif // !UCONFIG_NO_FILTERED_BREAK_ITERATION
  46
// TEST_ASSERT(x)
//   Reports a test failure through errln(), tagged with the current source file
//   and line, when the expression x evaluates to false.  Execution continues
//   after the report; the macro does not return from the enclosing function.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d", __FILE__, __LINE__);}}

// TEST_ASSERT_SUCCESS(errcode)
//   Reports a failure through errcheckln(), including the file, line and the
//   u_errorName() text of errcode, when errcode satisfies U_FAILURE().
//   Like TEST_ASSERT, it only reports; callers that must stop do so themselves.
#define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
    errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  52
  53
  54 //---------------------------------------------
  55 // runIndexedTest
  56 //---------------------------------------------
  57
  58
  59 //  Note:  Before adding new tests to this file, check whether the desired test data can
  60 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
  61 //         it's much less work than writing a new test, diagnostic output in the event of failures
   62 //         is good, and the test data file is shared with ICU4J, so eventually the test
  63 //         will run there as well, without additional effort.
  64
  65 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  66 {
  67     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  68
  69     switch (index) {
  70 #if !UCONFIG_NO_FILE_IO
  71         case 0: name = "TestBug4153072";
  72             if(exec) TestBug4153072();                         break;
  73 #else
  74         case 0: name = "skip";
  75             break;
  76 #endif
  77
  78         case 1: name = "skip";
  79             break;
  80         case 2: name = "TestStatusReturn";
  81             if(exec) TestStatusReturn();                       break;
  82
  83 #if !UCONFIG_NO_FILE_IO
  84         case 3: name = "TestUnicodeFiles";
  85             if(exec) TestUnicodeFiles();                       break;
  86         case 4: name = "TestEmptyString";
  87             if(exec) TestEmptyString();                        break;
  88 #else
  89         case 3: case 4: name = "skip";
  90             break;
  91 #endif
  92
  93         case 5: name = "TestGetAvailableLocales";
  94             if(exec) TestGetAvailableLocales();                break;
  95
  96         case 6: name = "TestGetDisplayName";
  97             if(exec) TestGetDisplayName();                     break;
  98
  99 #if !UCONFIG_NO_FILE_IO
 100         case 7: name = "TestEndBehaviour";
 101             if(exec) TestEndBehaviour();                       break;
 102         case 8: case 9: case 10: name = "skip";
 103              break;
 104         case 11: name = "TestWordBreaks";
 105              if(exec) TestWordBreaks();                        break;
 106         case 12: name = "TestWordBoundary";
 107              if(exec) TestWordBoundary();                      break;
 108         case 13: name = "TestLineBreaks";
 109              if(exec) TestLineBreaks();                        break;
 110         case 14: name = "TestSentBreaks";
 111              if(exec) TestSentBreaks();                        break;
 112         case 15: name = "TestExtended";
 113              if(exec) TestExtended();                          break;
 114 #else
 115         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 116              break;
 117 #endif
 118
 119 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 120         case 16:
 121             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
 122 #else
 123         case 16:
 124              name = "skip";                                    break;
 125 #endif
 126
 127 #if !UCONFIG_NO_FILE_IO
 128         case 17: name = "TestBug3818";
 129             if(exec) TestBug3818();                            break;
 130 #else
 131         case 17: name = "skip";
 132             break;
 133 #endif
 134
 135         case 18: name = "skip";
 136             break;
 137         case 19: name = "TestDebug";
 138             if(exec) TestDebug();                              break;
 139         case 20: name = "skip";
 140             break;
 141
 142 #if !UCONFIG_NO_FILE_IO
 143         case 21: name = "TestBug5775";
 144             if (exec) TestBug5775();                           break;
 145 #else
 146         case 21: name = "skip";
 147             break;
 148 #endif
 149
 150         case 22: name = "TestBug9983";
 151             if (exec) TestBug9983();                           break;
 152         case 23: name = "TestDictRules";
 153             if (exec) TestDictRules();                         break;
 154         case 24: name = "TestBug5532";
 155             if (exec) TestBug5532();                           break;
 156         default: name = ""; break; //needed to end loop
 157     }
 158 }
 159
 160
 161 //---------------------------------------------------------------------------
 162 //
 163 //   class BITestData   Holds a set of Break iterator test data and results
 164 //                      Includes
 165 //                         - the string data to be broken
 166 //                         - a vector of the expected break positions.
 167 //                         - a vector of source line numbers for the data,
  168 //                               (to help see where errors occurred.)
 169 //                         - The expected break tag values.
 170 //                         - Vectors of actual break positions and tag values.
 171 //                         - Functions for comparing actual with expected and
 172 //                            reporting errors.
 173 //
 174 //----------------------------------------------------------------------------
// Container for one break-iterator test case: the text, the expected results,
// and the results actually observed, plus helpers to compare the two.
class BITestData {
public:
    UnicodeString    fDataToBreak;            // Text to be broken; chunks are appended by addDataChunk().
    UVector          fExpectedBreakPositions; // Expected break offsets (end position of each chunk).
    UVector          fExpectedTags;           // Expected tag (rule status) value for each break.
    UVector          fLineNum;                // Source line number recorded for each chunk.
    UVector          fActualBreakPositions;   // Test Results.
    UVector          fActualTags;             // Tag values observed at the actual break positions.

    // Constructs the vectors; status is handed to each UVector constructor.
    BITestData(UErrorCode &status);
    // Appends data (if non-NULL) to the text and records its end offset, tag and line number.
    // Note that status is received by value.
    void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
    // Compares actual against expected positions and tags, reporting differences through test.
    void             checkResults(const char *heading, RBBITest *test);
    // Reports one position mismatch for the given expected/actual vector indexes.
    void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
    // Empties the actual-result vectors; expected data is left untouched.
    void             clearResults();
};
 190
 191 //
 192 // Constructor.
 193 //
 194 BITestData::BITestData(UErrorCode &status)
 195 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 196   fActualTags(status)
 197 {
 198 }
 199
 200 //
  201 // addDataChunk.   Add a section (non-breaking) piece of data to the test data.
 202 //                 The macro form collects the line number, which is helpful
 203 //                 when tracking down failures.
 204 //
 205 //                 A null data item is inserted at the start of each test's data
 206 //                  to put the starting zero into the data list.  The position saved for
 207 //                  each non-null item is its ending position.
 208 //
 209 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 210 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 211     if (U_FAILURE(status)) {return;}
 212     if (data != NULL) {
 213         fDataToBreak.append(CharsToUnicodeString(data));
 214     }
 215     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 216     fExpectedTags.addElement(tag, status);
 217     fLineNum.addElement(lineNum, status);
 218 }
 219
 220
 221 //
 222 //  checkResults.   Compare the actual and expected break positions, report any differences.
 223 //
 224 void BITestData::checkResults(const char *heading, RBBITest *test) {
 225     int32_t   expectedIndex = 0;
 226     int32_t   actualIndex = 0;
 227
 228     for (;;) {
 229         // If we've run through both the expected and actual results vectors, we're done.
 230         //   break out of the loop.
 231         if (expectedIndex >= fExpectedBreakPositions.size() &&
 232             actualIndex   >= fActualBreakPositions.size()) {
 233             break;
 234         }
 235
 236
 237         if (expectedIndex >= fExpectedBreakPositions.size()) {
 238             err(heading, test, expectedIndex-1, actualIndex);
 239             actualIndex++;
 240             continue;
 241         }
 242
 243         if (actualIndex >= fActualBreakPositions.size()) {
 244             err(heading, test, expectedIndex, actualIndex-1);
 245             expectedIndex++;
 246             continue;
 247         }
 248
 249         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 250             err(heading, test, expectedIndex, actualIndex);
 251             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 252             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 253                 actualIndex++;
 254             } else {
 255                 expectedIndex++;
 256             }
 257             continue;
 258         }
 259
 260         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 261             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 262                 heading, fLineNum.elementAt(expectedIndex),
 263                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 264         }
 265
 266         actualIndex++;
 267         expectedIndex++;
 268     }
 269 }
 270
 271 //
 272 //  err   -  An error was found.  Report it, along with information about where the
 273 //                                incorrectly broken test data appeared in the source file.
 274 //
 275 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 276 {
 277     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 278     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 279     int32_t   o        = 0;
 280     int32_t   line     = fLineNum.elementAti(expectedIdx);
 281     if (expectedIdx > 0) {
 282         // The line numbers are off by one because a premature break occurs somewhere
 283         //    within the previous item, rather than at the start of the current (expected) item.
 284         //    We want to report the offset of the unexpected break from the start of
 285         //      this previous item.
 286         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 287     }
 288     if (actual < expected) {
 289         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 290     } else {
 291         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 292     }
 293 }
 294
 295
 296 void BITestData::clearResults() {
 297     fActualBreakPositions.removeAllElements();
 298     fActualTags.removeAllElements();
 299 }
 300
 301
 302 //--------------------------------------------------------------------------------------
 303 //
 304 //    RBBITest    constructor and destructor
 305 //
 306 //--------------------------------------------------------------------------------------
 307
 308 RBBITest::RBBITest() {
 309 }
 310
 311
 312 RBBITest::~RBBITest() {
 313 }
 314
 315 //-----------------------------------------------------------------------------------
 316 //
 317 //   Test for status {tag} return value from break rules.
 318 //        TODO:  a more thorough test.
 319 //
 320 //-----------------------------------------------------------------------------------
 321 void RBBITest::TestStatusReturn() {
 322      UnicodeString rulesString1("$Letters = [:L:];\n"
 323                                   "$Numbers = [:N:];\n"
 324                                   "$Letters+{1};\n"
 325                                   "$Numbers+{2};\n"
 326                                   "Help\\ /me\\!{4};\n"
 327                                   "[^$Letters $Numbers];\n"
 328                                   "!.*;\n", -1, US_INV);
 329      UnicodeString testString1  = "abc123..abc Help me Help me!";
 330                                 // 01234567890123456789012345678
 331      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 332      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 333
 334      UErrorCode status=U_ZERO_ERROR;
 335      UParseError    parseError;
 336
 337      LocalPointer <BreakIterator> bi(new RuleBasedBreakIterator(rulesString1, parseError, status));
 338      if(U_FAILURE(status)) {
 339          dataerrln("%s:%d error in break iterator construction - %s", __FILE__, __LINE__,  u_errorName(status));
 340          return;
 341      }
 342      int32_t  pos;
 343      int32_t  i = 0;
 344      bi->setText(testString1);
 345      for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 346          if (pos != bounds1[i]) {
 347              errln("%s:%d  expected break at %d, got %d\n", __FILE__, __LINE__, bounds1[i], pos);
 348              break;
 349          }
 350
 351          int tag = bi->getRuleStatus();
 352          if (tag != brkStatus[i]) {
 353              errln("%s:%d  break at %d, expected tag %d, got tag %d\n", __FILE__, __LINE__, pos, brkStatus[i], tag);
 354              break;
 355          }
 356          i++;
 357      }
 358 }
 359
 360
 361 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 362     UErrorCode status = U_ZERO_ERROR;
 363     char name[100];
 364     printf("code    alpha extend alphanum type word sent line name\n");
 365     int nextExpectedIndex = 0;
 366     utext_setNativeIndex(tstr, 0);
 367     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
 368         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 369             printf("------------------------------------------------ %d\n", j);
 370             ++nextExpectedIndex;
 371         }
 372
 373         UChar32 c = utext_next32(tstr);
 374         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 375         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 376                            u_isUAlphabetic(c),
 377                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 378                            u_isalnum(c),
 379                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 380                                                   u_charType(c),
 381                                                   U_SHORT_PROPERTY_NAME),
 382                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 383                                                   u_getIntPropertyValue(c,
 384                                                           UCHAR_WORD_BREAK),
 385                                                   U_SHORT_PROPERTY_NAME),
 386                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 387                                    u_getIntPropertyValue(c,
 388                                            UCHAR_SENTENCE_BREAK),
 389                                    U_SHORT_PROPERTY_NAME),
 390                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 391                                    u_getIntPropertyValue(c,
 392                                            UCHAR_LINE_BREAK),
 393                                    U_SHORT_PROPERTY_NAME),
 394                            name);
 395     }
 396 }
 397
 398
 399 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 400    UErrorCode status = U_ZERO_ERROR;
 401    UText *tstr = NULL;
 402    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 403    if (U_FAILURE(status)) {
 404        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 405        return;
 406     }
 407    printStringBreaks(tstr, expected, expectedCount);
 408    utext_close(tstr);
 409 }
 410
 411
 412 void RBBITest::TestBug3818() {
 413     UErrorCode  status = U_ZERO_ERROR;
 414
 415     // Four Thai words...
 416     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 417                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 418     UnicodeString  thaiStr(thaiWordData);
 419
 420     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 421     if (U_FAILURE(status) || bi == NULL) {
 422         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 423         return;
 424     }
 425     bi->setText(thaiStr);
 426
 427     int32_t  startOfSecondWord = bi->following(1);
 428     if (startOfSecondWord != 4) {
 429         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 430             __FILE__, __LINE__, startOfSecondWord);
 431     }
 432     startOfSecondWord = bi->following(0);
 433     if (startOfSecondWord != 4) {
 434         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 435             __FILE__, __LINE__, startOfSecondWord);
 436     }
 437     delete bi;
 438 }
 439
 440 //----------------------------------------------------------------------------
 441 //
 442 // generalIteratorTest      Given a break iterator and a set of test data,
 443 //                          Run the tests and report the results.
 444 //
 445 //----------------------------------------------------------------------------
 446 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 447 {
 448
 449     bi.setText(td.fDataToBreak);
 450
 451     testFirstAndNext(bi, td);
 452
 453     testLastAndPrevious(bi, td);
 454
 455     testFollowing(bi, td);
 456     testPreceding(bi, td);
 457     testIsBoundary(bi, td);
 458     doMultipleSelectionTest(bi, td);
 459 }
 460
 461
 462 //
 463 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 464 //                       kind of loop.
 465 //
 466 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 467 {
 468     UErrorCode  status = U_ZERO_ERROR;
 469     int32_t     p;
 470     int32_t     lastP = -1;
 471     int32_t     tag;
 472
 473     logln("Test first and next");
 474     bi.setText(td.fDataToBreak);
 475     td.clearResults();
 476
 477     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 478         td.fActualBreakPositions.addElement(p, status);  // Save result.
 479         tag = bi.getRuleStatus();
 480         td.fActualTags.addElement(tag, status);
 481         if (p <= lastP) {
 482             // If the iterator is not making forward progress, stop.
 483             //  No need to raise an error here, it'll be detected in the normal check of results.
 484             break;
 485         }
 486         lastP = p;
 487     }
 488     td.checkResults("testFirstAndNext", this);
 489 }
 490
 491
 492 //
 493 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 494 //
 495 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 496 {
 497     UErrorCode  status = U_ZERO_ERROR;
 498     int32_t     p;
 499     int32_t     lastP  = 0x7ffffffe;
 500     int32_t     tag;
 501
 502     logln("Test last and previous");
 503     bi.setText(td.fDataToBreak);
 504     td.clearResults();
 505
 506     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 507         // Save break position.  Insert it at start of vector of results, shoving
 508         //    already-saved results further towards the end.
 509         td.fActualBreakPositions.insertElementAt(p, 0, status);
 510         // bi.previous();   // TODO:  Why does this fix things up????
 511         // bi.next();
 512         tag = bi.getRuleStatus();
 513         td.fActualTags.insertElementAt(tag, 0, status);
 514         if (p >= lastP) {
 515             // If the iterator is not making progress, stop.
 516             //  No need to raise an error here, it'll be detected in the normal check of results.
 517             break;
 518         }
 519         lastP = p;
 520     }
 521     td.checkResults("testLastAndPrevious", this);
 522 }
 523
 524
 525 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 526 {
 527     UErrorCode  status = U_ZERO_ERROR;
 528     int32_t     p;
 529     int32_t     tag;
 530     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 531                                  //   cannot be -1; that is returned for DONE.
 532     int         i;
 533
 534     logln("testFollowing():");
 535     bi.setText(td.fDataToBreak);
 536     td.clearResults();
 537
 538     // Save the starting point, since we won't get that out of following.
 539     p = bi.first();
 540     td.fActualBreakPositions.addElement(p, status);  // Save result.
 541     tag = bi.getRuleStatus();
 542     td.fActualTags.addElement(tag, status);
 543
 544     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 545         p = bi.following(i);
 546         if (p != lastP) {
 547             if (p == RuleBasedBreakIterator::DONE) {
 548                 break;
 549             }
 550             // We've reached a new break position.  Save it.
 551             td.fActualBreakPositions.addElement(p, status);  // Save result.
 552             tag = bi.getRuleStatus();
 553             td.fActualTags.addElement(tag, status);
 554             lastP = p;
 555         }
 556     }
 557     // The loop normally exits by means of the break in the middle.
 558     // Make sure that the index was at the correct position for the break iterator to have
 559     //   returned DONE.
 560     if (i != td.fDataToBreak.length()) {
 561         errln("testFollowing():  iterator returned DONE prematurely.");
 562     }
 563
 564     // Full check of all results.
 565     td.checkResults("testFollowing", this);
 566 }
 567
 568
 569
 570 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 571     UErrorCode  status = U_ZERO_ERROR;
 572     int32_t     p;
 573     int32_t     tag;
 574     int32_t     lastP  = 0x7ffffffe;
 575     int         i;
 576
 577     logln("testPreceding():");
 578     bi.setText(td.fDataToBreak);
 579     td.clearResults();
 580
 581     p = bi.last();
 582     td.fActualBreakPositions.addElement(p, status);
 583     tag = bi.getRuleStatus();
 584     td.fActualTags.addElement(tag, status);
 585
 586     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 587         p = bi.preceding(i);
 588         if (p != lastP) {
 589             if (p == RuleBasedBreakIterator::DONE) {
 590                 break;
 591             }
 592             // We've reached a new break position.  Save it.
 593             td.fActualBreakPositions.insertElementAt(p, 0, status);
 594             lastP = p;
 595             tag = bi.getRuleStatus();
 596             td.fActualTags.insertElementAt(tag, 0, status);
 597         }
 598     }
 599     // The loop normally exits by means of the break in the middle.
 600     // Make sure that the index was at the correct position for the break iterator to have
 601     //   returned DONE.
 602     if (i != 0) {
 603         errln("testPreceding():  iterator returned DONE prematurely.");
 604     }
 605
 606     // Full check of all results.
 607     td.checkResults("testPreceding", this);
 608 }
 609
 610
 611
 612 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 613     UErrorCode  status = U_ZERO_ERROR;
 614     int         i;
 615     int32_t     tag;
 616
 617     logln("testIsBoundary():");
 618     bi.setText(td.fDataToBreak);
 619     td.clearResults();
 620
 621     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 622         if (bi.isBoundary(i)) {
 623             td.fActualBreakPositions.addElement(i, status);  // Save result.
 624             tag = bi.getRuleStatus();
 625             td.fActualTags.addElement(tag, status);
 626         }
 627     }
 628     td.checkResults("testIsBoundary: ", this);
 629 }
 630
 631
 632
 633 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 634 {
 635     iterator.setText(td.fDataToBreak);
 636
 637     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 638     int32_t offset = iterator.first();
 639     int32_t testOffset;
 640     int32_t count = 0;
 641
 642     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 643
 644     if (*testIterator != iterator)
 645         errln("clone() or operator!= failed: two clones compared unequal");
 646
 647     do {
 648         testOffset = testIterator->first();
 649         testOffset = testIterator->next(count);
 650         if (offset != testOffset)
 651             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 652
 653         if (offset != RuleBasedBreakIterator::DONE) {
 654             count++;
 655             offset = iterator.next();
 656
 657             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 658                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 659                 if (count > 10000 || offset == -1) {
 660                     errln("operator== failed too many times. Stopping test.");
 661                     if (offset == -1) {
 662                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 663                     }
 664                     return;
 665                 }
 666             }
 667         }
 668     } while (offset != RuleBasedBreakIterator::DONE);
 669
 670     // now do it backwards...
 671     offset = iterator.last();
 672     count = 0;
 673
 674     do {
 675         testOffset = testIterator->last();
 676         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 677         if (offset != testOffset)
 678             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 679
 680         if (offset != RuleBasedBreakIterator::DONE) {
 681             count--;
 682             offset = iterator.previous();
 683         }
 684     } while (offset != RuleBasedBreakIterator::DONE);
 685
 686     delete testIterator;
 687 }
 688
 689
 690 //---------------------------------------------
 691 //
 692 //     other tests
 693 //
 694 //---------------------------------------------
 695 void RBBITest::TestEmptyString()
 696 {
 697     UnicodeString text = "";
 698     UErrorCode status = U_ZERO_ERROR;
 699
 700     BITestData x(status);
 701     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 702     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 703     if (U_FAILURE(status))
 704     {
 705         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 706         return;
 707     }
 708     generalIteratorTest(*bi, x);
 709     delete bi;
 710 }
 711
 712 void RBBITest::TestGetAvailableLocales()
 713 {
 714     int32_t locCount = 0;
 715     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 716
 717     if (locCount == 0)
 718         dataerrln("getAvailableLocales() returned an empty list!");
 719     // Just make sure that it's returning good memory.
 720     int32_t i;
 721     for (i = 0; i < locCount; ++i) {
 722         logln(locList[i].getName());
 723     }
 724 }
 725
 726 //Testing the BreakIterator::getDisplayName() function
 727 void RBBITest::TestGetDisplayName()
 728 {
 729     UnicodeString   result;
 730
 731     BreakIterator::getDisplayName(Locale::getUS(), result);
 732     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 733         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 734                 + result);
 735
 736     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 737     if (result != "French (France)")
 738         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 739                 + result);
 740 }
 741 /**
 742  * Test End Behaviour
 743  * @bug 4068137
 744  */
 745 void RBBITest::TestEndBehaviour()
 746 {
 747     UErrorCode status = U_ZERO_ERROR;
 748     UnicodeString testString("boo.");
 749     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 750     if (U_FAILURE(status))
 751     {
 752         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 753         return;
 754     }
 755     wb->setText(testString);
 756
 757     if (wb->first() != 0)
 758         errln("Didn't get break at beginning of string.");
 759     if (wb->next() != 3)
 760         errln("Didn't get break before period in \"boo.\"");
 761     if (wb->current() != 4 && wb->next() != 4)
 762         errln("Didn't get break at end of string.");
 763     delete wb;
 764 }
 765 /*
 766  * @bug 4153072
 767  */
 768 void RBBITest::TestBug4153072() {
 769     UErrorCode status = U_ZERO_ERROR;
 770     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 771     if (U_FAILURE(status))
 772     {
 773         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 774         return;
 775     }
 776     UnicodeString str("...Hello, World!...");
 777     int32_t begin = 3;
 778     int32_t end = str.length() - 3;
 779     UBool onBoundary;
 780
 781     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 782     iter->adoptText(textIterator);
 783     int index;
 784     // Note: with the switch to UText, there is no way to restrict the
 785     //       iteration range to begin at an index other than zero.
 786     //       String character iterators created with a non-zero bound are
 787     //         treated by RBBI as being empty.
 788     for (index = -1; index < begin + 1; ++index) {
 789         onBoundary = iter->isBoundary(index);
 790         if (index == 0?  !onBoundary : onBoundary) {
 791             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 792                             " and begin index = " + begin);
 793         }
 794     }
 795     delete iter;
 796 }
 797
 798
 799 //
 800 // Test for problem reported by Ashok Matoria on 9 July 2007
 801 //    One.<kSoftHyphen><kSpace>Two.
 802 //
 803 //    Sentence break at start (0) and then on calling next() it breaks at
 804 //   'T' of "Two". Now, at this point if I do next() and
 805 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 806 //
 807 void RBBITest::TestBug5775() {
 808     UErrorCode status = U_ZERO_ERROR;
 809     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 810     TEST_ASSERT_SUCCESS(status);
 811     if (U_FAILURE(status)) {
 812         return;
 813     }
 814 // Check for status first for better handling of no data errors.
 815     TEST_ASSERT(bi != NULL);
 816     if (bi == NULL) {
 817         return;
 818     }
 819
 820     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 821     //               01234      56789
 822     s = s.unescape();
 823     bi->setText(s);
 824     int pos = bi->next();
 825     TEST_ASSERT(pos == 6);
 826     pos = bi->next();
 827     TEST_ASSERT(pos == 10);
 828     pos = bi->previous();
 829     TEST_ASSERT(pos == 6);
 830     delete bi;
 831 }
 832
 833
 834
 835 //------------------------------------------------------------------------------
 836 //
 837 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 838 //
 839 //------------------------------------------------------------------------------
 840
 841 struct TestParams {
 842     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 843                                            //   Changed out whenever test data changes break type.
 844
 845     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 846     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 847     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 848     UVector32       *srcCol;
 849
 850     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 851     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 852     CharString       utf8String;           // UTF-8 form of text to break.
 853
 854     TestParams(UErrorCode &status) : dataToBreak() {
 855         bi               = NULL;
 856         expectedBreaks   = new UVector32(status);
 857         srcLine          = new UVector32(status);
 858         srcCol           = new UVector32(status);
 859         textToBreak      = NULL;
 860         textMap          = new UVector32(status);
 861     }
 862
 863     ~TestParams() {
 864         delete bi;
 865         delete expectedBreaks;
 866         delete srcLine;
 867         delete srcCol;
 868         utext_close(textToBreak);
 869         delete textMap;
 870     }
 871
 872     int32_t getSrcLine(int32_t bp);
 873     int32_t getExpectedBreak(int32_t bp);
 874     int32_t getSrcCol(int32_t bp);
 875
 876     void setUTF16(UErrorCode &status);
 877     void setUTF8(UErrorCode &status);
 878 };
 879
 880 // Append a UnicodeString to a CharString with UTF-8 encoding.
 881 // Substitute any invalid chars.
 882 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 883 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 884     if (U_FAILURE(status)) {
 885         return;
 886     }
 887     int32_t utf8Length;
 888     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 889                        src.getBuffer(), src.length(),   // UTF-16 data
 890                        0xfffd, NULL,                    // Substitution char, number of subs.
 891                        &status);
 892     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 893         return;
 894     }
 895     status = U_ZERO_ERROR;
 896     int32_t capacity;
 897     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 898     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 899                        src.getBuffer(), src.length(),
 900                        0xfffd, NULL, &status);
 901     dest.append(buffer, utf8Length, status);
 902 }
 903
 904
 905 void TestParams::setUTF16(UErrorCode &status) {
 906     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 907     textMap->removeAllElements();
 908     for (int32_t i=0; i<dataToBreak.length(); i++) {
 909         if (i == dataToBreak.getChar32Start(i)) {
 910             textMap->addElement(i, status);
 911         } else {
 912             textMap->addElement(-1, status);
 913         }
 914     }
 915     textMap->addElement(dataToBreak.length(), status);
 916     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 917 }
 918
 919
 920 void TestParams::setUTF8(UErrorCode &status) {
 921     if (U_FAILURE(status)) {
 922         return;
 923     }
 924     utf8String.clear();
 925     CharStringAppend(utf8String, dataToBreak, status);
 926     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 927     if (U_FAILURE(status)) {
 928         return;
 929     }
 930
 931     textMap->removeAllElements();
 932     int32_t utf16Index = 0;
 933     for (;;) {
 934         textMap->addElement(utf16Index, status);
 935         UChar32 c32 = utext_current32(textToBreak);
 936         if (c32 < 0) {
 937             break;
 938         }
 939         utf16Index += U16_LENGTH(c32);
 940         utext_next32(textToBreak);
 941         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 942             textMap->addElement(-1, status);
 943         }
 944     }
 945     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 946 }
 947
 948
 949 int32_t TestParams::getSrcLine(int bp) {
 950     if (bp >= textMap->size()) {
 951         bp = textMap->size() - 1;
 952     }
 953     int32_t i = 0;
 954     for(; bp >= 0 ; --bp) {
 955         // Move to a character boundary if we are not on one already.
 956         i = textMap->elementAti(bp);
 957         if (i >= 0) {
 958             break;
 959         }
 960     }
 961     return srcLine->elementAti(i);
 962 }
 963
 964
 965 int32_t TestParams::getExpectedBreak(int bp) {
 966     if (bp >= textMap->size()) {
 967         return 0;
 968     }
 969     int32_t i = textMap->elementAti(bp);
 970     int32_t retVal = 0;
 971     if (i >= 0) {
 972         retVal = expectedBreaks->elementAti(i);
 973     }
 974     return retVal;
 975 }
 976
 977
 978 int32_t TestParams::getSrcCol(int bp) {
 979     if (bp >= textMap->size()) {
 980         bp = textMap->size() - 1;
 981     }
 982     int32_t i = 0;
 983     for(; bp >= 0; --bp) {
 984         // Move bp to a character boundary if we are not on one already.
 985         i = textMap->elementAti(bp);
 986         if (i >= 0) {
 987             break;
 988         }
 989     }
 990     return srcCol->elementAti(i);
 991 }
 992
 993
 994 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 995     int32_t    bp;
 996     int32_t    prevBP;
 997     int32_t    i;
 998
 999     TEST_ASSERT_SUCCESS(status);
1000     if (U_FAILURE(status)) {
1001         return;
1002     }
1003
1004     if (t->bi == NULL) {
1005         return;
1006     }
1007
1008     t->bi->setText(t->textToBreak, status);
1009     //
1010     //  Run the iterator forward
1011     //
1012     prevBP = -1;
1013     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1014         if (prevBP ==  bp) {
1015             // Fail for lack of forward progress.
1016             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1017                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1018             break;
1019         }
1020
1021         // Check that there we didn't miss an expected break between the last one
1022         //  and this one.
1023         for (i=prevBP+1; i<bp; i++) {
1024             if (t->getExpectedBreak(i) != 0) {
1025                 int expected[] = {0, i};
1026                 printStringBreaks(t->dataToBreak, expected, 2);
1027                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1028                       i, t->getSrcLine(i), t->getSrcCol(i));
1029             }
1030         }
1031
1032         // Check that the break we did find was expected
1033         if (t->getExpectedBreak(bp) == 0) {
1034             int expected[] = {0, bp};
1035             printStringBreaks(t->textToBreak, expected, 2);
1036             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1037                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1038         } else {
1039             // The break was expected.
1040             //   Check that the {nnn} tag value is correct.
1041             int32_t expectedTagVal = t->getExpectedBreak(bp);
1042             if (expectedTagVal == -1) {
1043                 expectedTagVal = 0;
1044             }
1045             int32_t line = t->getSrcLine(bp);
1046             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1047             if (rs != expectedTagVal) {
1048                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1049                       "          Actual, Expected status = %4d, %4d",
1050                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1051             }
1052         }
1053
1054         prevBP = bp;
1055     }
1056
1057     // Verify that there were no missed expected breaks after the last one found
1058     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1059         if (t->getExpectedBreak(i) != 0) {
1060             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1061                       i, t->getSrcLine(i), t->getSrcCol(i));
1062         }
1063     }
1064
1065     //
1066     //  Run the iterator backwards, verify that the same breaks are found.
1067     //
1068     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1069     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1070         if (prevBP ==  bp) {
1071             // Fail for lack of progress.
1072             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1073                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1074             break;
1075         }
1076
1077         // Check that we didn't miss an expected break between the last one
1078         //  and this one.  (UVector returns zeros for index out of bounds.)
1079         for (i=prevBP-1; i>bp; i--) {
1080             if (t->getExpectedBreak(i) != 0) {
1081                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1082                       i, t->getSrcLine(i), t->getSrcCol(i));
1083             }
1084         }
1085
1086         // Check that the break we did find was expected
1087         if (t->getExpectedBreak(bp) == 0) {
1088             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1089                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1090         } else {
1091             // The break was expected.
1092             //   Check that the {nnn} tag value is correct.
1093             int32_t expectedTagVal = t->getExpectedBreak(bp);
1094             if (expectedTagVal == -1) {
1095                 expectedTagVal = 0;
1096             }
1097             int line = t->getSrcLine(bp);
1098             int32_t rs = t->bi->getRuleStatus();
1099             if (rs != expectedTagVal) {
1100                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1101                       "          Actual, Expected status = %4d, %4d",
1102                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1103             }
1104         }
1105
1106         prevBP = bp;
1107     }
1108
1109     // Verify that there were no missed breaks prior to the last one found
1110     for (i=prevBP-1; i>=0; i--) {
1111         if (t->getExpectedBreak(i) != 0) {
1112             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1113                       i, t->getSrcLine(i), t->getSrcCol(i));
1114         }
1115     }
1116
1117     // Check isBoundary()
1118     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1119         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1120         UBool boundaryFound    = t->bi->isBoundary(i);
1121         if (boundaryExpected != boundaryFound) {
1122             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1123                   "        Expected, Actual= %s, %s",
1124                   i, t->getSrcLine(i), t->getSrcCol(i),
1125                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1126         }
1127     }
1128
1129     // Check following()
1130     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1131         int32_t actualBreak = t->bi->following(i);
1132         int32_t expectedBreak = BreakIterator::DONE;
1133         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1134             if (t->getExpectedBreak(j) != 0) {
1135                 expectedBreak = j;
1136                 break;
1137             }
1138         }
1139         if (expectedBreak != actualBreak) {
1140             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1141                   "        Expected, Actual= %d, %d",
1142                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1143         }
1144     }
1145
1146     // Check preceding()
1147     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1148         int32_t actualBreak = t->bi->preceding(i);
1149         int32_t expectedBreak = BreakIterator::DONE;
1150
1151         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1152         // preceding(trailing byte) will return the index of some preceding code point,
1153         // not the lead byte of the current code point, even though that has a smaller index.
1154         // Therefore, start looking at the expected break data not at i-1, but at
1155         // the start of code point index - 1.
1156         utext_setNativeIndex(t->textToBreak, i);
1157         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1158         for (; j >= 0; j--) {
1159             if (t->getExpectedBreak(j) != 0) {
1160                 expectedBreak = j;
1161                 break;
1162             }
1163         }
1164         if (expectedBreak != actualBreak) {
1165             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1166                   "        Expected, Actual= %d, %d",
1167                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1168         }
1169     }
1170 }
1171
1172
1173 void RBBITest::TestExtended() {
1174 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1175     UErrorCode      status  = U_ZERO_ERROR;
1176     Locale          locale("");
1177
1178     UnicodeString       rules;
1179     TestParams          tp(status);
1180
1181     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@&=-]*) *>"), 0, status);
1182     if (U_FAILURE(status)) {
1183         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1184     }
1185
1186
1187     //
1188     //  Open and read the test data file.
1189     //
1190     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1191     char testFileName[1000];
1192     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1193         errln("Can't open test data.  Path too long.");
1194         return;
1195     }
1196     strcpy(testFileName, testDataDirectory);
1197     strcat(testFileName, "rbbitst.txt");
1198
1199     int    len;
1200     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1201     if (U_FAILURE(status)) {
1202         return; /* something went wrong, error already output */
1203     }
1204
1205
1206     bool skipTest = false; // Skip this test?
1207
1208     //
1209     //  Put the test data into a UnicodeString
1210     //
1211     UnicodeString testString(FALSE, testFile, len);
1212
1213     enum EParseState{
1214         PARSE_COMMENT,
1215         PARSE_TAG,
1216         PARSE_DATA,
1217         PARSE_NUM
1218     }
1219     parseState = PARSE_TAG;
1220
1221     EParseState savedState = PARSE_TAG;
1222
1223     static const UChar CH_LF        = 0x0a;
1224     static const UChar CH_CR        = 0x0d;
1225     static const UChar CH_HASH      = 0x23;
1226     /*static const UChar CH_PERIOD    = 0x2e;*/
1227     static const UChar CH_LT        = 0x3c;
1228     static const UChar CH_GT        = 0x3e;
1229     static const UChar CH_BACKSLASH = 0x5c;
1230     static const UChar CH_BULLET    = 0x2022;
1231
1232     int32_t    lineNum  = 1;
1233     int32_t    colStart = 0;
1234     int32_t    column   = 0;
1235     int32_t    charIdx  = 0;
1236
1237     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1238
1239     for (charIdx = 0; charIdx < len; ) {
1240         status = U_ZERO_ERROR;
1241         UChar  c = testString.charAt(charIdx);
1242         charIdx++;
1243         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1244             // treat CRLF as a unit
1245             c = CH_LF;
1246             charIdx++;
1247         }
1248         if (c == CH_LF || c == CH_CR) {
1249             lineNum++;
1250             colStart = charIdx;
1251         }
1252         column = charIdx - colStart + 1;
1253
1254         switch (parseState) {
1255         case PARSE_COMMENT:
1256             if (c == 0x0a || c == 0x0d) {
1257                 parseState = savedState;
1258             }
1259             break;
1260
1261         case PARSE_TAG:
1262             {
1263             if (c == CH_HASH) {
1264                 parseState = PARSE_COMMENT;
1265                 savedState = PARSE_TAG;
1266                 break;
1267             }
1268             if (u_isUWhiteSpace(c)) {
1269                 break;
1270             }
1271             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1272                 delete tp.bi;
1273                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1274                 skipTest = false;
1275                 charIdx += 5;
1276                 break;
1277             }
1278             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1279                 delete tp.bi;
1280                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1281                 skipTest = false;
1282                 charIdx += 5;
1283                 break;
1284             }
1285             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1286                 delete tp.bi;
1287                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1288                 skipTest = false;
1289                 charIdx += 5;
1290                 break;
1291             }
1292             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1293                 delete tp.bi;
1294                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1295                 skipTest = false;
1296                 charIdx += 5;
1297                 break;
1298             }
1299             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1300                 delete tp.bi;
1301                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1302                 charIdx += 6;
1303                 break;
1304             }
1305
1306             // <locale  loc_name>
1307             localeMatcher.reset(testString);
1308             if (localeMatcher.lookingAt(charIdx-1, status)) {
1309                 UnicodeString localeName = localeMatcher.group(1, status);
1310                 char localeName8[100];
1311                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1312                 locale = Locale::createFromName(localeName8);
1313                 charIdx += localeMatcher.group(0, status).length() - 1;
1314                 TEST_ASSERT_SUCCESS(status);
1315                 break;
1316             }
1317             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1318                 parseState = PARSE_DATA;
1319                 charIdx += 5;
1320                 tp.dataToBreak = "";
1321                 tp.expectedBreaks->removeAllElements();
1322                 tp.srcCol ->removeAllElements();
1323                 tp.srcLine->removeAllElements();
1324                 break;
1325             }
1326
1327             errln("line %d: Tag expected in test file.", lineNum);
1328             parseState = PARSE_COMMENT;
1329             savedState = PARSE_DATA;
1330             goto end_test; // Stop the test.
1331             }
1332             break;
1333
1334         case PARSE_DATA:
1335             if (c == CH_BULLET) {
1336                 int32_t  breakIdx = tp.dataToBreak.length();
1337                 tp.expectedBreaks->setSize(breakIdx+1);
1338                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1339                 tp.srcLine->setSize(breakIdx+1);
1340                 tp.srcLine->setElementAt(lineNum, breakIdx);
1341                 tp.srcCol ->setSize(breakIdx+1);
1342                 tp.srcCol ->setElementAt(column, breakIdx);
1343                 break;
1344             }
1345
1346             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1347                 // Add final entry to mappings from break location to source file position.
1348                 //  Need one extra because last break position returned is after the
1349                 //    last char in the data, not at the last char.
1350                 tp.srcLine->addElement(lineNum, status);
1351                 tp.srcCol ->addElement(column, status);
1352
1353                 parseState = PARSE_TAG;
1354                 charIdx += 6;
1355
1356                 if (!skipTest) {
1357                     // RUN THE TEST!
1358                     status = U_ZERO_ERROR;
1359                     tp.setUTF16(status);
1360                     executeTest(&tp, status);
1361                     TEST_ASSERT_SUCCESS(status);
1362
1363                     // Run again, this time with UTF-8 text wrapped in a UText.
1364                     status = U_ZERO_ERROR;
1365                     tp.setUTF8(status);
1366                     TEST_ASSERT_SUCCESS(status);
1367                     executeTest(&tp, status);
1368                 }
1369                 break;
1370             }
1371
1372             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1373                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1374                 // Get the code point from the name and insert it into the test data.
1375                 //   (Damn, no API takes names in Unicode  !!!
1376                 //    we've got to take it back to char *)
1377                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1378                 int32_t nameLength = nameEndIdx - (charIdx+2);
1379                 char charNameBuf[200];
1380                 UChar32 theChar = -1;
1381                 if (nameEndIdx != -1) {
1382                     UErrorCode status = U_ZERO_ERROR;
1383                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1384                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1385                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1386                     if (U_FAILURE(status)) {
1387                         theChar = -1;
1388                     }
1389                 }
1390                 if (theChar == -1) {
1391                     errln("Error in named character in test file at line %d, col %d",
1392                         lineNum, column);
1393                 } else {
1394                     // Named code point was recognized.  Insert it
1395                     //   into the test data.
1396                     tp.dataToBreak.append(theChar);
1397                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1398                         tp.srcLine->addElement(lineNum, status);
1399                         tp.srcCol ->addElement(column, status);
1400                     }
1401                 }
1402                 if (nameEndIdx > charIdx) {
1403                     charIdx = nameEndIdx+1;
1404
1405                 }
1406                 break;
1407             }
1408
1409
1410
1411
1412             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1413                 charIdx++;
1414                 int32_t  breakIdx = tp.dataToBreak.length();
1415                 tp.expectedBreaks->setSize(breakIdx+1);
1416                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1417                 tp.srcLine->setSize(breakIdx+1);
1418                 tp.srcLine->setElementAt(lineNum, breakIdx);
1419                 tp.srcCol ->setSize(breakIdx+1);
1420                 tp.srcCol ->setElementAt(column, breakIdx);
1421                 break;
1422             }
1423
1424             if (c == CH_LT) {
1425                 tagValue   = 0;
1426                 parseState = PARSE_NUM;
1427                 break;
1428             }
1429
1430             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1431                 parseState = PARSE_COMMENT;
1432                 savedState = PARSE_DATA;
1433                 break;
1434             }
1435
1436             if (c == CH_BACKSLASH) {
1437                 // Check for \ at end of line, a line continuation.
1438                 //     Advance over (discard) the newline
1439                 UChar32 cp = testString.char32At(charIdx);
1440                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1441                     // We have a CR LF
1442                     //  Need an extra increment of the input ptr to move over both of them
1443                     charIdx++;
1444                 }
1445                 if (cp == CH_LF || cp == CH_CR) {
1446                     lineNum++;
1447                     colStart = charIdx;
1448                     charIdx++;
1449                     break;
1450                 }
1451
1452                 // Let unescape handle the back slash.
1453                 cp = testString.unescapeAt(charIdx);
1454                 if (cp != -1) {
1455                     // Escape sequence was recognized.  Insert the char
1456                     //   into the test data.
1457                     tp.dataToBreak.append(cp);
1458                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1459                         tp.srcLine->addElement(lineNum, status);
1460                         tp.srcCol ->addElement(column, status);
1461                     }
1462                     break;
1463                 }
1464
1465
1466                 // Not a recognized backslash escape sequence.
1467                 // Take the next char as a literal.
1468                 //  TODO:  Should this be an error?
1469                 c = testString.charAt(charIdx);
1470                 charIdx = testString.moveIndex32(charIdx, 1);
1471             }
1472
1473             // Normal, non-escaped data char.
1474             tp.dataToBreak.append(c);
1475
1476             // Save the mapping from offset in the data to line/column numbers in
1477             //   the original input file.  Will be used for better error messages only.
1478             //   If there's an expected break before this char, the slot in the mapping
1479             //     vector will already be set for this char; don't overwrite it.
1480             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1481                 tp.srcLine->addElement(lineNum, status);
1482                 tp.srcCol ->addElement(column, status);
1483             }
1484             break;
1485
1486
1487         case PARSE_NUM:
1488             // We are parsing an expected numeric tag value, like <1234>,
1489             //   within a chunk of data.
1490             if (u_isUWhiteSpace(c)) {
1491                 break;
1492             }
1493
1494             if (c == CH_GT) {
1495                 // Finished the number.  Add the info to the expected break data,
1496                 //   and switch parse state back to doing plain data.
1497                 parseState = PARSE_DATA;
1498                 if (tagValue == 0) {
1499                     tagValue = -1;
1500                 }
1501                 int32_t  breakIdx = tp.dataToBreak.length();
1502                 tp.expectedBreaks->setSize(breakIdx+1);
1503                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1504                 tp.srcLine->setSize(breakIdx+1);
1505                 tp.srcLine->setElementAt(lineNum, breakIdx);
1506                 tp.srcCol ->setSize(breakIdx+1);
1507                 tp.srcCol ->setElementAt(column, breakIdx);
1508                 break;
1509             }
1510
1511             if (u_isdigit(c)) {
1512                 tagValue = tagValue*10 + u_charDigitValue(c);
1513                 break;
1514             }
1515
1516             errln("Syntax Error in test file at line %d, col %d",
1517                 lineNum, column);
1518             parseState = PARSE_COMMENT;
1519             goto end_test; // Stop the test
1520             break;
1521         }
1522
1523
1524         if (U_FAILURE(status)) {
1525             dataerrln("ICU Error %s while parsing test file at line %d.",
1526                 u_errorName(status), lineNum);
1527             status = U_ZERO_ERROR;
1528             goto end_test; // Stop the test
1529         }
1530
1531     }
1532
1533 end_test:
1534     delete [] testFile;
1535 #endif
1536 }
1537
1538
1539 //-------------------------------------------------------------------------------
1540 //
1541 //  TestDictRules   create a break iterator from source rules that includes a
1542 //                  dictionary range.   Regression for bug #7130.  Source rules
1543 //                  do not declare a break iterator type (word, line, sentence, etc.
1544 //                  but the dictionary code, without a type, would loop.
1545 //
1546 //-------------------------------------------------------------------------------
1547 void RBBITest::TestDictRules() {
1548     const char *rules =  "$dictionary = [a-z]; \n"
1549                          "!!forward; \n"
1550                          "$dictionary $dictionary; \n"
1551                          "!!reverse; \n"
1552                          "$dictionary $dictionary; \n";
1553     const char *text = "aa";
1554     UErrorCode status = U_ZERO_ERROR;
1555     UParseError parseError;
1556
1557     RuleBasedBreakIterator bi(rules, parseError, status);
1558     if (U_SUCCESS(status)) {
1559         UnicodeString utext = text;
1560         bi.setText(utext);
1561         int32_t position;
1562         int32_t loops;
1563         for (loops = 0; loops<10; loops++) {
1564             position = bi.next();
1565             if (position == RuleBasedBreakIterator::DONE) {
1566                 break;
1567             }
1568         }
1569         TEST_ASSERT(loops == 1);
1570     } else {
1571         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1572     }
1573 }
1574
1575
1576
1577 //-------------------------------------------------------------------------------
1578 //
1579 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1580 //    return the data in one big UChar * buffer, which the caller must delete.
1581 //
1582 //    parameters:
1583 //          fileName:   the name of the file, with no directory part.  The test data directory
1584 //                      is assumed.
1585 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1586 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1587 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1588 //                      Pass NULL for the system default encoding.
1589 //          status
1590 //    returns:
1591 //                      The file data, converted to UChar.
1592 //                      The caller must delete this when done with
1593 //                           delete [] theBuffer;
1594 //
1595 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1596 //           Move this function to some common place.
1597 //
1598 //--------------------------------------------------------------------------------
1599 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1600     UChar       *retPtr  = NULL;
1601     char        *fileBuf = NULL;
1602     UConverter* conv     = NULL;
1603     FILE        *f       = NULL;
1604
1605     ulen = 0;
1606     if (U_FAILURE(status)) {
1607         return retPtr;
1608     }
1609
1610     //
1611     //  Open the file.
1612     //
1613     f = fopen(fileName, "rb");
1614     if (f == 0) {
1615         dataerrln("Error opening test data file %s\n", fileName);
1616         status = U_FILE_ACCESS_ERROR;
1617         return NULL;
1618     }
1619     //
1620     //  Read it in
1621     //
1622     int   fileSize;
1623     int   amt_read;
1624
1625     fseek( f, 0, SEEK_END);
1626     fileSize = ftell(f);
1627     fileBuf = new char[fileSize];
1628     fseek(f, 0, SEEK_SET);
1629     amt_read = fread(fileBuf, 1, fileSize, f);
1630     if (amt_read != fileSize || fileSize <= 0) {
1631         errln("Error reading test data file.");
1632         goto cleanUpAndReturn;
1633     }
1634
1635     //
1636     // Look for a Unicode Signature (BOM) on the data just read
1637     //
1638     int32_t        signatureLength;
1639     const char *   fileBufC;
1640     const char*    bomEncoding;
1641
1642     fileBufC = fileBuf;
1643     bomEncoding = ucnv_detectUnicodeSignature(
1644         fileBuf, fileSize, &signatureLength, &status);
1645     if(bomEncoding!=NULL ){
1646         fileBufC  += signatureLength;
1647         fileSize  -= signatureLength;
1648         encoding = bomEncoding;
1649     }
1650
1651     //
1652     // Open a converter to take the rule file to UTF-16
1653     //
1654     conv = ucnv_open(encoding, &status);
1655     if (U_FAILURE(status)) {
1656         goto cleanUpAndReturn;
1657     }
1658
1659     //
1660     // Convert the rules to UChar.
1661     //  Preflight first to determine required buffer size.
1662     //
1663     ulen = ucnv_toUChars(conv,
1664         NULL,           //  dest,
1665         0,              //  destCapacity,
1666         fileBufC,
1667         fileSize,
1668         &status);
1669     if (status == U_BUFFER_OVERFLOW_ERROR) {
1670         // Buffer Overflow is expected from the preflight operation.
1671         status = U_ZERO_ERROR;
1672
1673         retPtr = new UChar[ulen+1];
1674         ucnv_toUChars(conv,
1675             retPtr,       //  dest,
1676             ulen+1,
1677             fileBufC,
1678             fileSize,
1679             &status);
1680     }
1681
1682 cleanUpAndReturn:
1683     fclose(f);
1684     delete []fileBuf;
1685     ucnv_close(conv);
1686     if (U_FAILURE(status)) {
1687         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1688         delete []retPtr;
1689         retPtr = 0;
1690         ulen   = 0;
1691     };
1692     return retPtr;
1693 }
1694
1695
1696
1697 //--------------------------------------------------------------------------------------------
1698 //
1699 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1700 //
1701 //-------------------------------------------------------------------------------------------
1702 void RBBITest::TestUnicodeFiles() {
1703     RuleBasedBreakIterator  *bi;
1704     UErrorCode               status = U_ZERO_ERROR;
1705
1706     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1707     TEST_ASSERT_SUCCESS(status);
1708     if (U_SUCCESS(status)) {
1709         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1710     }
1711     delete bi;
1712
1713     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1714     TEST_ASSERT_SUCCESS(status);
1715     if (U_SUCCESS(status)) {
1716         runUnicodeTestData("WordBreakTest.txt", bi);
1717     }
1718     delete bi;
1719
1720     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1721     TEST_ASSERT_SUCCESS(status);
1722     if (U_SUCCESS(status)) {
1723         runUnicodeTestData("SentenceBreakTest.txt", bi);
1724     }
1725     delete bi;
1726
1727     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1728     TEST_ASSERT_SUCCESS(status);
1729     if (U_SUCCESS(status)) {
1730         runUnicodeTestData("LineBreakTest.txt", bi);
1731     }
1732     delete bi;
1733 }
1734
1735
1736 // Check for test cases from the Unicode test data files that are known to fail
1737 // and should be skipped because ICU is not yet able to fully implement the spec.
1738 // See ticket #7270.
1739
1740 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1741     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1742         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1743         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1744         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1745         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1746         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1747         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1748     };
1749     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1750         return FALSE;
1751     }
1752
1753     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1754         if (testCase == UnicodeString(badTestCases[i])) {
1755             return logKnownIssue("7270");
1756         }
1757     }
1758     return FALSE;
1759 }
1760
1761
1762 //--------------------------------------------------------------------------------------------
1763 //
1764 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1765 //
1766 //-------------------------------------------------------------------------------------------
1767 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1768 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1769     UErrorCode  status = U_ZERO_ERROR;
1770
1771     //
1772     //  Open and read the test data file, put it into a UnicodeString.
1773     //
1774     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1775     char testFileName[1000];
1776     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1777         dataerrln("Can't open test data.  Path too long.");
1778         return;
1779     }
1780     strcpy(testFileName, testDataDirectory);
1781     strcat(testFileName, fileName);
1782
1783     logln("Opening data file %s\n", fileName);
1784
1785     int    len;
1786     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1787     if (status != U_FILE_ACCESS_ERROR) {
1788         TEST_ASSERT_SUCCESS(status);
1789         TEST_ASSERT(testFile != NULL);
1790     }
1791     if (U_FAILURE(status) || testFile == NULL) {
1792         return; /* something went wrong, error already output */
1793     }
1794     UnicodeString testFileAsString(TRUE, testFile, len);
1795
1796     //
1797     //  Parse the test data file using a regular expression.
1798     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1799     //     is identified by which group had a match.
1800     //
1801     //    Caputure Group #                  1          2            3            4           5
1802     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1803     //
1804     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1805     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1806     UnicodeString   testString;
1807     UVector32       breakPositions(status);
1808     int             lineNumber = 1;
1809     TEST_ASSERT_SUCCESS(status);
1810     if (U_FAILURE(status)) {
1811         return;
1812     }
1813
1814     //
1815     //  Scan through each test case, building up the string to be broken in testString,
1816     //   and the positions that should be boundaries in the breakPositions vector.
1817     //
1818     int spin = 0;
1819     while (tokenMatcher.find()) {
1820         if(tokenMatcher.hitEnd()) {
1821           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1822              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1823              and caused an infinite loop here on EBCDIC systems!
1824           */
1825           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1826           //       return;
1827         }
1828         if (tokenMatcher.start(1, status) >= 0) {
1829             // Scanned a divide sign, indicating a break position in the test data.
1830             if (testString.length()>0) {
1831                 breakPositions.addElement(testString.length(), status);
1832             }
1833         }
1834         else if (tokenMatcher.start(2, status) >= 0) {
1835             // Scanned an 'x', meaning no break at this position in the test data
1836             //   Nothing to be done here.
1837             }
1838         else if (tokenMatcher.start(3, status) >= 0) {
1839             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1840             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1841             int length = hexNumber.length();
1842             if (length<=8) {
1843                 char buf[10];
1844                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1845                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1846                 if (c<=0x10ffff) {
1847                     testString.append(c);
1848                 } else {
1849                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1850                        fileName, lineNumber);
1851                 }
1852             } else {
1853                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1854                        fileName, lineNumber);
1855              }
1856         }
1857         else if (tokenMatcher.start(4, status) >= 0) {
1858             // Scanned to end of a line, possibly skipping over a comment in the process.
1859             //   If the line from the file contained test data, run the test now.
1860             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1861                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1862             }
1863
1864             // Clear out this test case.
1865             //    The string and breakPositions vector will be refilled as the next
1866             //       test case is parsed.
1867             testString.remove();
1868             breakPositions.removeAllElements();
1869             lineNumber++;
1870         } else {
1871             // Scanner catchall.  Something unrecognized appeared on the line.
1872             char token[16];
1873             UnicodeString uToken = tokenMatcher.group(0, status);
1874             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1875             token[sizeof(token)-1] = 0;
1876             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1877
1878             // Clean up, in preparation for continuing with the next line.
1879             testString.remove();
1880             breakPositions.removeAllElements();
1881             lineNumber++;
1882         }
1883         TEST_ASSERT_SUCCESS(status);
1884         if (U_FAILURE(status)) {
1885             break;
1886         }
1887     }
1888
1889     delete [] testFile;
1890  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1891 }
1892
1893 //--------------------------------------------------------------------------------------------
1894 //
1895 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1896 //                            test data files.  Do only a simple, forward-only check -
1897 //                            this test is mostly to check that ICU and the Unicode
1898 //                            data agree with each other.
1899 //
1900 //--------------------------------------------------------------------------------------------
1901 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1902                          const UnicodeString &testString,   // Text data to be broken
1903                          UVector32 *breakPositions,         // Positions where breaks should be found.
1904                          RuleBasedBreakIterator *bi) {
1905     int32_t pos;                 // Break Position in the test string
1906     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1907     int32_t expectedPos;         // Expected break position (index into test string)
1908
1909     bi->setText(testString);
1910     pos = bi->first();
1911     pos = bi->next();
1912
1913     while (pos != BreakIterator::DONE) {
1914         if (expectedI >= breakPositions->size()) {
1915             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1916                 testFileName, lineNumber, pos);
1917             break;
1918         }
1919         expectedPos = breakPositions->elementAti(expectedI);
1920         if (pos < expectedPos) {
1921             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1922                 testFileName, lineNumber, pos);
1923             break;
1924         }
1925         if (pos > expectedPos) {
1926             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1927                 testFileName, lineNumber, expectedPos);
1928             break;
1929         }
1930         pos = bi->next();
1931         expectedI++;
1932     }
1933
1934     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1935         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1936             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1937     }
1938 }
1939
1940
1941
1942 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1943 //---------------------------------------------------------------------------------------
1944 //
//   class RBBIMonkeyKind
1946 //
1947 //      Monkey Test for Break Iteration
1948 //      Abstract interface class.   Concrete derived classes independently
1949 //      implement the break rules for different iterator types.
1950 //
//      The Monkey Test itself doesn't know which type of break iterator it is
1952 //      testing, but works purely in terms of the interface defined here.
1953 //
1954 //---------------------------------------------------------------------------------------
1955 class RBBIMonkeyKind {
1956 public:
1957     // Return a UVector of UnicodeSets, representing the character classes used
1958     //   for this type of iterator.
1959     virtual  UVector  *charClasses() = 0;
1960
1961     // Set the test text on which subsequent calls to next() will operate
1962     virtual  void      setText(const UnicodeString &s) = 0;
1963
1964     // Find the next break postion, starting from the prev break position, or from zero.
1965     // Return -1 after reaching end of string.
1966     virtual  int32_t   next(int32_t i) = 0;
1967
1968     virtual ~RBBIMonkeyKind();
1969     UErrorCode       deferredStatus;
1970
1971
1972 protected:
1973     RBBIMonkeyKind();
1974
1975 private:
1976 };
1977
1978 RBBIMonkeyKind::RBBIMonkeyKind() {
1979     deferredStatus = U_ZERO_ERROR;
1980 }
1981
1982 RBBIMonkeyKind::~RBBIMonkeyKind() {
1983 }
1984
1985
1986 //----------------------------------------------------------------------------------------
1987 //
1988 //   Random Numbers.  Similar to standard lib rand() and srand()
1989 //                    Not using library to
1990 //                      1.  Get same results on all platforms.
1991 //                      2.  Get access to current seed, to more easily reproduce failures.
1992 //
1993 //---------------------------------------------------------------------------------------
// Current generator state.  Assign to it to restart a sequence.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
// The arithmetic is unsigned 32 bit and wraps around.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    m_seed = m_seed * kMultiplier + kIncrement;
    // Bits 16..30 of the state form the result.
    return (m_seed >> 16) & 0x7fff;
}
2001
2002
2003 //------------------------------------------------------------------------------------------
2004 //
2005 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2006 //                             of RBBIMonkeyKind.
2007 //
2008 //------------------------------------------------------------------------------------------
2009 class RBBICharMonkey: public RBBIMonkeyKind {
2010 public:
2011     RBBICharMonkey();
2012     virtual          ~RBBICharMonkey();
2013     virtual  UVector *charClasses();
2014     virtual  void     setText(const UnicodeString &s);
2015     virtual  int32_t  next(int32_t i);
2016 private:
2017     UVector   *fSets;
2018
2019     UnicodeSet  *fCRLFSet;
2020     UnicodeSet  *fControlSet;
2021     UnicodeSet  *fExtendSet;
2022     UnicodeSet  *fRegionalIndicatorSet;
2023     UnicodeSet  *fPrependSet;
2024     UnicodeSet  *fSpacingSet;
2025     UnicodeSet  *fLSet;
2026     UnicodeSet  *fVSet;
2027     UnicodeSet  *fTSet;
2028     UnicodeSet  *fLVSet;
2029     UnicodeSet  *fLVTSet;
2030     UnicodeSet  *fHangulSet;
2031     UnicodeSet  *fAnySet;
2032     UnicodeSet  *fEmojiModifierSet;
2033     UnicodeSet  *fEmojiBaseSet;
2034     UnicodeSet  *fZWJSet;
2035     UnicodeSet  *fGAZSet;
2036
2037     const UnicodeString *fText;
2038 };
2039
2040
2041 RBBICharMonkey::RBBICharMonkey() {
2042     UErrorCode  status = U_ZERO_ERROR;
2043
2044     fText = NULL;
2045
2046     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2047     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Control}]-[:Block=Tags:]]"), status);
2048     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[[\\p{Grapheme_Cluster_Break = Extend}][:Block=Tags:]]"), status);
2049     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2050     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2051     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2052     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2053     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2054     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2055     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2056     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2057     fHangulSet  = new UnicodeSet();
2058     fHangulSet->addAll(*fLSet);
2059     fHangulSet->addAll(*fVSet);
2060     fHangulSet->addAll(*fTSet);
2061     fHangulSet->addAll(*fLVSet);
2062     fHangulSet->addAll(*fLVTSet);
2063     fAnySet     = new UnicodeSet(0, 0x10ffff);
2064
2065
2066
2067     fEmojiBaseSet = new UnicodeSet(UnicodeString(
2068                 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
2069                 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
2070                 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
2071                 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
2072                 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
2073
2074     fEmojiModifierSet = new UnicodeSet(0x0001F3FB, 0x0001F3FF);
2075     fZWJSet           = new UnicodeSet(0x200D, 0x200D);
2076     fGAZSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
2077                                         "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
2078                                         "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
2079                                         "\\U0001F680\\U0001F692]"), status);
2080
2081     fSets       = new UVector(status);
2082     fSets->addElement(fCRLFSet,    status);
2083     fSets->addElement(fControlSet, status);
2084     fSets->addElement(fExtendSet,  status);
2085     fSets->addElement(fRegionalIndicatorSet, status);
2086     if (!fPrependSet->isEmpty()) {
2087         fSets->addElement(fPrependSet, status);
2088     }
2089     fSets->addElement(fSpacingSet, status);
2090     fSets->addElement(fHangulSet,  status);
2091     fSets->addElement(fAnySet,     status);
2092     fSets->addElement(fEmojiBaseSet, status);
2093     fSets->addElement(fEmojiModifierSet, status);
2094     fSets->addElement(fZWJSet,     status);
2095     fSets->addElement(fGAZSet,     status);
2096     if (U_FAILURE(status)) {
2097         deferredStatus = status;
2098     }
2099 }
2100
2101
2102 void RBBICharMonkey::setText(const UnicodeString &s) {
2103     fText = &s;
2104 }
2105
2106
2107
2108 int32_t RBBICharMonkey::next(int32_t prevPos) {
2109     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2110                               //   break position being tested.  The candidate break
2111                               //   location is before p2.
2112
2113     int     breakPos = -1;
2114
2115     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2116     UChar32 cBase;            // for (X Extend*) patterns, the X character.
2117
2118     if (U_FAILURE(deferredStatus)) {
2119         return -1;
2120     }
2121
2122     // Previous break at end of string.  return DONE.
2123     if (prevPos >= fText->length()) {
2124         return -1;
2125     }
2126     p0 = p1 = p2 = p3 = prevPos;
2127     c3 =  fText->char32At(prevPos);
2128     c0 = c1 = c2 = cBase = 0;
2129     (void)p0;   // suppress set but not used warning.
2130     (void)c0;
2131
2132     // Loop runs once per "significant" character position in the input text.
2133     for (;;) {
2134         // Move all of the positions forward in the input string.
2135         p0 = p1;  c0 = c1;
2136         p1 = p2;  c1 = c2;
2137         p2 = p3;  c2 = c3;
2138
2139         // Advancd p3 by one codepoint
2140         p3 = fText->moveIndex32(p3, 1);
2141         c3 = fText->char32At(p3);
2142
2143         if (p1 == p2) {
2144             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2145             continue;
2146         }
2147         if (p2 == fText->length()) {
2148             // Reached end of string.  Always a break position.
2149             break;
2150         }
2151
2152         // Rule  GB3   CR x LF
2153         //     No Extend or Format characters may appear between the CR and LF,
2154         //     which requires the additional check for p2 immediately following p1.
2155         //
2156         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2157             continue;
2158         }
2159
2160         // Rule (GB4).   ( Control | CR | LF ) <break>
2161         if (fControlSet->contains(c1) ||
2162             c1 == 0x0D ||
2163             c1 == 0x0A)  {
2164             break;
2165         }
2166
2167         // Rule (GB5)    <break>  ( Control | CR | LF )
2168         //
2169         if (fControlSet->contains(c2) ||
2170             c2 == 0x0D ||
2171             c2 == 0x0A)  {
2172             break;
2173         }
2174
2175
2176         // Rule (GB6)  L x ( L | V | LV | LVT )
2177         if (fLSet->contains(c1) &&
2178                (fLSet->contains(c2)  ||
2179                 fVSet->contains(c2)  ||
2180                 fLVSet->contains(c2) ||
2181                 fLVTSet->contains(c2))) {
2182             continue;
2183         }
2184
2185         // Rule (GB7)    ( LV | V )  x  ( V | T )
2186         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2187             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2188             continue;
2189         }
2190
2191         // Rule (GB8)    ( LVT | T)  x T
2192         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2193             fTSet->contains(c2))  {
2194             continue;
2195         }
2196
2197         // Rule (GB9)    x (Extend | ZWJ)
2198         if (fExtendSet->contains(c2) || fZWJSet->contains(c2))  {
2199             if (!fExtendSet->contains(c1)) {
2200                 cBase = c1;
2201             }
2202             continue;
2203         }
2204
2205         // Rule (GB9a)   x  SpacingMark
2206         if (fSpacingSet->contains(c2)) {
2207             continue;
2208         }
2209
2210         // Rule (GB9b)   Prepend x
2211         if (fPrependSet->contains(c1)) {
2212             continue;
2213         }
2214
2215         // Rule (GB10)   ($E_Base | $GAZ) $Extend* $E_Modifier;
2216         if ((fEmojiBaseSet->contains(c1) || fGAZSet->contains(c1)) && fEmojiModifierSet->contains(c2)) {
2217             continue;
2218         }
2219         if ((fEmojiBaseSet->contains(cBase) || fGAZSet->contains(cBase)) &&
2220                 fExtendSet->contains(c1) && fEmojiModifierSet->contains(c2)) {
2221             continue;
2222         }
2223
2224         // Rule (GB11)   ZWJ x Glue_After_Zwj
2225         if (fZWJSet->contains(c1) && fGAZSet->contains(c2)) {
2226             continue;
2227         }
2228
2229         // Rule (GB12-13)    Regional_Indicator x Regional_Indicator
2230         //                   Note: The first if condition is a little tricky. We only need to force
2231         //                      a break if there are three or more contiguous RIs. If there are
2232         //                      only two, a break following will occur via other rules, and will include
2233         //                      any trailing extend characters, which is needed behavior.
2234         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)
2235                 && fRegionalIndicatorSet->contains(c2)) {
2236             break;
2237         }
2238         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2239             continue;
2240         }
2241
2242         // Rule (GB999)  Any  <break>  Any
2243         break;
2244     }
2245
2246     breakPos = p2;
2247     return breakPos;
2248 }
2249
2250
2251
2252 UVector  *RBBICharMonkey::charClasses() {
2253     return fSets;
2254 }
2255
2256
2257 RBBICharMonkey::~RBBICharMonkey() {
2258     delete fSets;
2259     delete fCRLFSet;
2260     delete fControlSet;
2261     delete fExtendSet;
2262     delete fRegionalIndicatorSet;
2263     delete fPrependSet;
2264     delete fSpacingSet;
2265     delete fLSet;
2266     delete fVSet;
2267     delete fTSet;
2268     delete fLVSet;
2269     delete fLVTSet;
2270     delete fHangulSet;
2271     delete fAnySet;
2272     delete fEmojiBaseSet;
2273     delete fEmojiModifierSet;
2274     delete fZWJSet;
2275     delete fGAZSet;
2276 }
2277
2278 //------------------------------------------------------------------------------------------
2279 //
2280 //   class RBBIWordMonkey      Word Break specific implementation
2281 //                             of RBBIMonkeyKind.
2282 //
2283 //------------------------------------------------------------------------------------------
2284 class RBBIWordMonkey: public RBBIMonkeyKind {
2285 public:
2286     RBBIWordMonkey();
2287     virtual          ~RBBIWordMonkey();
2288     virtual  UVector *charClasses();
2289     virtual  void     setText(const UnicodeString &s);
2290     virtual int32_t   next(int32_t i);
2291 private:
2292     UVector      *fSets;
2293
2294     UnicodeSet  *fCRSet;
2295     UnicodeSet  *fLFSet;
2296     UnicodeSet  *fNewlineSet;
2297     UnicodeSet  *fRegionalIndicatorSet;
2298     UnicodeSet  *fKatakanaSet;
2299     UnicodeSet  *fHebrew_LetterSet;
2300     UnicodeSet  *fALetterSet;
2301     // TODO(jungshik): Do we still need this change?
2302     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2303     UnicodeSet  *fSingle_QuoteSet;
2304     UnicodeSet  *fDouble_QuoteSet;
2305     UnicodeSet  *fMidNumLetSet;
2306     UnicodeSet  *fMidLetterSet;
2307     UnicodeSet  *fMidNumSet;
2308     UnicodeSet  *fNumericSet;
2309     UnicodeSet  *fFormatSet;
2310     UnicodeSet  *fOtherSet;
2311     UnicodeSet  *fExtendSet;
2312     UnicodeSet  *fExtendNumLetSet;
2313     UnicodeSet  *fDictionaryCjkSet;
2314     UnicodeSet  *fEBaseSet;
2315     UnicodeSet  *fEModifierSet;
2316     UnicodeSet  *fZWSSet;
2317     UnicodeSet  *fGAZSet;
2318
2319     const UnicodeString  *fText;
2320 };
2321
2322
2323 RBBIWordMonkey::RBBIWordMonkey()
2324 {
2325     UErrorCode  status = U_ZERO_ERROR;
2326
2327     fSets            = new UVector(status);
2328
2329     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2330     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2331     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2332     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2333     // Exclude Hangul syllables from ALetterSet during testing.
2334     // Leave CJK dictionary characters out from the monkey tests!
2335 #if 0
2336     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2337                                       "[\\p{Line_Break = Complex_Context}"
2338                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2339                                       "-\\p{Grapheme_Cluster_Break = Control}"
2340                                       "]]",
2341                                       status);
2342 #endif
2343     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2344     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2345     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2346     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2347     fALetterSet->removeAll(*fDictionaryCjkSet);
2348     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2349     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2350     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2351     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter} - [\\:]]"),    status);
2352     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2353     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2354     // we should figure out why
2355     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2356     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2357     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2358     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2359
2360     fEBaseSet         = new UnicodeSet(UnicodeString(
2361                 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
2362                 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
2363                 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
2364                 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
2365                 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
2366
2367     fEModifierSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
2368     fZWSSet          = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);;
2369     fGAZSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u2640\\u2642\\u2695-\\u2696\\u2708\\u2764"
2370                                         "\\U0001F308\\U0001F33E\\U0001F373\\U0001F393\\U0001F3A4\\U0001F3A8\\U0001F3EB\\U0001F3ED"
2371                                         "\\U0001F466-\\U0001F469\\U0001F48B\\U0001F4BB-\\U0001F4BC\\U0001F527\\U0001F52C\\U0001F5E8"
2372                                         "\\U0001F680\\U0001F692]"), status);
2373     fExtendSet->removeAll(*fZWSSet);
2374
2375
2376     fOtherSet        = new UnicodeSet();
2377     if(U_FAILURE(status)) {
2378       deferredStatus = status;
2379       return;
2380     }
2381
2382     fOtherSet->complement();
2383     fOtherSet->removeAll(*fCRSet);
2384     fOtherSet->removeAll(*fLFSet);
2385     fOtherSet->removeAll(*fNewlineSet);
2386     fOtherSet->removeAll(*fKatakanaSet);
2387     fOtherSet->removeAll(*fHebrew_LetterSet);
2388     fOtherSet->removeAll(*fALetterSet);
2389     fOtherSet->removeAll(*fSingle_QuoteSet);
2390     fOtherSet->removeAll(*fDouble_QuoteSet);
2391     fOtherSet->removeAll(*fMidLetterSet);
2392     fOtherSet->removeAll(*fMidNumSet);
2393     fOtherSet->removeAll(*fNumericSet);
2394     fOtherSet->removeAll(*fExtendNumLetSet);
2395     fOtherSet->removeAll(*fFormatSet);
2396     fOtherSet->removeAll(*fExtendSet);
2397     fOtherSet->removeAll(*fRegionalIndicatorSet);
2398     fOtherSet->removeAll(*fEBaseSet);
2399     fOtherSet->removeAll(*fEModifierSet);
2400     fOtherSet->removeAll(*fZWSSet);
2401     fOtherSet->removeAll(*fGAZSet);
2402
2403     // Inhibit dictionary characters from being tested at all.
2404     fOtherSet->removeAll(*fDictionaryCjkSet);
2405     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2406
2407     fSets->addElement(fCRSet,                status);
2408     fSets->addElement(fLFSet,                status);
2409     fSets->addElement(fNewlineSet,           status);
2410     fSets->addElement(fRegionalIndicatorSet, status);
2411     fSets->addElement(fHebrew_LetterSet,     status);
2412     fSets->addElement(fALetterSet,           status);
2413     fSets->addElement(fSingle_QuoteSet,      status);
2414     fSets->addElement(fDouble_QuoteSet,      status);
2415     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2416     fSets->addElement(fMidLetterSet,         status);
2417     fSets->addElement(fMidNumLetSet,         status);
2418     fSets->addElement(fMidNumSet,            status);
2419     fSets->addElement(fNumericSet,           status);
2420     fSets->addElement(fFormatSet,            status);
2421     fSets->addElement(fExtendSet,            status);
2422     fSets->addElement(fOtherSet,             status);
2423     fSets->addElement(fExtendNumLetSet,      status);
2424
2425     fSets->addElement(fEBaseSet,             status);
2426     fSets->addElement(fEModifierSet,         status);
2427     fSets->addElement(fZWSSet,               status);
2428     fSets->addElement(fGAZSet,               status);
2429
2430     if (U_FAILURE(status)) {
2431         deferredStatus = status;
2432     }
2433 }
2434
2435 void RBBIWordMonkey::setText(const UnicodeString &s) {
2436     fText       = &s;
2437 }
2438
2439
2440 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2441     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2442                               //   break position being tested.  The candidate break
2443                               //   location is before p2.
2444
2445     int     breakPos = -1;
2446
2447     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2448
2449     if (U_FAILURE(deferredStatus)) {
2450         return -1;
2451     }
2452
2453     // Prev break at end of string.  return DONE.
2454     if (prevPos >= fText->length()) {
2455         return -1;
2456     }
2457     p0 = p1 = p2 = p3 = prevPos;
2458     c3 =  fText->char32At(prevPos);
2459     c0 = c1 = c2 = 0;
2460     (void)p0;       // Suppress set but not used warning.
2461
2462     // Loop runs once per "significant" character position in the input text.
2463     for (;;) {
2464         // Move all of the positions forward in the input string.
2465         p0 = p1;  c0 = c1;
2466         p1 = p2;  c1 = c2;
2467         p2 = p3;  c2 = c3;
2468
2469         // Advancd p3 by    X(Extend | Format)*   Rule 4
2470         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2471         do {
2472             p3 = fText->moveIndex32(p3, 1);
2473             c3 = fText->char32At(p3);
2474             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2475                break;
2476             };
2477         }
2478         while (fFormatSet->contains(c3) || fExtendSet->contains(c3) || fZWSSet->contains(c3));
2479
2480
2481         if (p1 == p2) {
2482             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2483             continue;
2484         }
2485         if (p2 == fText->length()) {
2486             // Reached end of string.  Always a break position.
2487             break;
2488         }
2489
2490         // Rule  (3)   CR x LF
2491         //     No Extend or Format characters may appear between the CR and LF,
2492         //     which requires the additional check for p2 immediately following p1.
2493         //
2494         if (c1==0x0D && c2==0x0A) {
2495             continue;
2496         }
2497
2498         // Rule (3a)  Break before and after newlines (including CR and LF)
2499         //
2500         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2501             break;
2502         };
2503         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2504             break;
2505         };
2506
2507         // Rule (3c)    ZWJ x GAZ (Glue after ZWJ).
2508         //              Not ignoring extend chars, so peek into input text to
2509         //              get the potential ZWJ, the character immediately preceding c2.
2510         //              Sloppy UChar32 indexing: p2-1 may reference trail half
2511         //              but char32At will get the full code point.
2512         if (fZWSSet->contains(fText->char32At(p2-1)) && fGAZSet->contains(c2)) {
2513             continue;
2514         }
2515
2516         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2517         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2518             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2519             continue;
2520         }
2521
2522         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2523         //
2524         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2525              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2526              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2527             continue;
2528         }
2529
2530         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2531         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2532             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2533             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2534             continue;
2535         }
2536
2537         // Rule (7a)     Hebrew_Letter x Single_Quote
2538         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2539             continue;
2540         }
2541
2542         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2543         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2544             continue;
2545         }
2546
2547         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2548         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2549             continue;
2550         }
2551
2552         // Rule (8)    Numeric x Numeric
2553         if (fNumericSet->contains(c1) &&
2554             fNumericSet->contains(c2))  {
2555             continue;
2556         }
2557
2558         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2559         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2560             fNumericSet->contains(c2))  {
2561             continue;
2562         }
2563
2564         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2565         if (fNumericSet->contains(c1) &&
2566             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2567             continue;
2568         }
2569
2570         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2571         if (fNumericSet->contains(c0) &&
2572             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2573             fNumericSet->contains(c2)) {
2574             continue;
2575         }
2576
2577         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2578         if (fNumericSet->contains(c1) &&
2579             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2580             fNumericSet->contains(c3)) {
2581             continue;
2582         }
2583
2584         // Rule (13)  Katakana x Katakana
2585         if (fKatakanaSet->contains(c1) &&
2586             fKatakanaSet->contains(c2))  {
2587             continue;
2588         }
2589
2590         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2591         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2592              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2593              fExtendNumLetSet->contains(c2)) {
2594                 continue;
2595         }
2596
2597         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2598         if (fExtendNumLetSet->contains(c1) &&
2599                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2600                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2601             continue;
2602         }
2603
2604         // Rule 13c
2605         if (fRegionalIndicatorSet->contains(c0) && fRegionalIndicatorSet->contains(c1)) {
2606             break;
2607         }
2608         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2609             continue;
2610         }
2611
2612         // Rule 13d
2613         if ((fEBaseSet->contains(c1)  || fGAZSet->contains(c1)) && fEModifierSet->contains(c2)) {
2614             continue;
2615         }
2616
2617         // Rule 14.  Break found here.
2618         break;
2619     }
2620
2621     breakPos = p2;
2622     return breakPos;
2623 }
2624
2625
2626 UVector  *RBBIWordMonkey::charClasses() {
2627     return fSets;
2628 }
2629
2630
2631 RBBIWordMonkey::~RBBIWordMonkey() {
2632     delete fSets;
2633     delete fCRSet;
2634     delete fLFSet;
2635     delete fNewlineSet;
2636     delete fKatakanaSet;
2637     delete fHebrew_LetterSet;
2638     delete fALetterSet;
2639     delete fSingle_QuoteSet;
2640     delete fDouble_QuoteSet;
2641     delete fMidNumLetSet;
2642     delete fMidLetterSet;
2643     delete fMidNumSet;
2644     delete fNumericSet;
2645     delete fFormatSet;
2646     delete fExtendSet;
2647     delete fExtendNumLetSet;
2648     delete fRegionalIndicatorSet;
2649     delete fDictionaryCjkSet;
2650     delete fOtherSet;
2651     delete fEBaseSet;
2652     delete fEModifierSet;
2653     delete fZWSSet;
2654     delete fGAZSet;
2655 }
2656
2657
2658
2659
2660 //------------------------------------------------------------------------------------------
2661 //
2662 //   class RBBISentMonkey      Sentence Break specific implementation
2663 //                             of RBBIMonkeyKind.
2664 //
2665 //------------------------------------------------------------------------------------------
2666 class RBBISentMonkey: public RBBIMonkeyKind {
2667 public:
2668     RBBISentMonkey();
2669     virtual          ~RBBISentMonkey();
2670     virtual  UVector *charClasses();
2671     virtual  void     setText(const UnicodeString &s);
2672     virtual int32_t   next(int32_t i);
2673 private:
2674     int               moveBack(int posFrom);
2675     int               moveForward(int posFrom);
2676     UChar32           cAt(int pos);
2677
2678     UVector      *fSets;
2679
2680     UnicodeSet  *fSepSet;
2681     UnicodeSet  *fFormatSet;
2682     UnicodeSet  *fSpSet;
2683     UnicodeSet  *fLowerSet;
2684     UnicodeSet  *fUpperSet;
2685     UnicodeSet  *fOLetterSet;
2686     UnicodeSet  *fNumericSet;
2687     UnicodeSet  *fATermSet;
2688     UnicodeSet  *fSContinueSet;
2689     UnicodeSet  *fSTermSet;
2690     UnicodeSet  *fCloseSet;
2691     UnicodeSet  *fOtherSet;
2692     UnicodeSet  *fExtendSet;
2693
2694     const UnicodeString  *fText;
2695
2696 };
2697
2698 RBBISentMonkey::RBBISentMonkey()
2699 {
2700     UErrorCode  status = U_ZERO_ERROR;
2701
2702     fSets            = new UVector(status);
2703
2704     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2705     //                       set and made into character classes of their own.  For the monkey impl,
2706     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2707     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2708     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2709     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2710     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2711     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2712     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2713     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2714     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2715     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2716     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2717     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2718     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2719     fOtherSet        = new UnicodeSet();
2720
2721     if(U_FAILURE(status)) {
2722       deferredStatus = status;
2723       return;
2724     }
2725
2726     fOtherSet->complement();
2727     fOtherSet->removeAll(*fSepSet);
2728     fOtherSet->removeAll(*fFormatSet);
2729     fOtherSet->removeAll(*fSpSet);
2730     fOtherSet->removeAll(*fLowerSet);
2731     fOtherSet->removeAll(*fUpperSet);
2732     fOtherSet->removeAll(*fOLetterSet);
2733     fOtherSet->removeAll(*fNumericSet);
2734     fOtherSet->removeAll(*fATermSet);
2735     fOtherSet->removeAll(*fSContinueSet);
2736     fOtherSet->removeAll(*fSTermSet);
2737     fOtherSet->removeAll(*fCloseSet);
2738     fOtherSet->removeAll(*fExtendSet);
2739
2740     fSets->addElement(fSepSet,       status);
2741     fSets->addElement(fFormatSet,    status);
2742     fSets->addElement(fSpSet,        status);
2743     fSets->addElement(fLowerSet,     status);
2744     fSets->addElement(fUpperSet,     status);
2745     fSets->addElement(fOLetterSet,   status);
2746     fSets->addElement(fNumericSet,   status);
2747     fSets->addElement(fATermSet,     status);
2748     fSets->addElement(fSContinueSet, status);
2749     fSets->addElement(fSTermSet,     status);
2750     fSets->addElement(fCloseSet,     status);
2751     fSets->addElement(fOtherSet,     status);
2752     fSets->addElement(fExtendSet,    status);
2753
2754     if (U_FAILURE(status)) {
2755         deferredStatus = status;
2756     }
2757 }
2758
2759
2760
2761 void RBBISentMonkey::setText(const UnicodeString &s) {
2762     fText       = &s;
2763 }
2764
2765 UVector  *RBBISentMonkey::charClasses() {
2766     return fSets;
2767 }
2768
2769
2770 //  moveBack()   Find the "significant" code point preceding the index i.
2771 //               Skips over ($Extend | $Format)* .
2772 //
2773 int RBBISentMonkey::moveBack(int i) {
2774     if (i <= 0) {
2775         return -1;
2776     }
2777     UChar32   c;
2778     int32_t   j = i;
2779     do {
2780         j = fText->moveIndex32(j, -1);
2781         c = fText->char32At(j);
2782     }
2783     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2784     return j;
2785
2786  }
2787
2788
2789 int RBBISentMonkey::moveForward(int i) {
2790     if (i>=fText->length()) {
2791         return fText->length();
2792     }
2793     UChar32   c;
2794     int32_t   j = i;
2795     do {
2796         j = fText->moveIndex32(j, 1);
2797         c = cAt(j);
2798     }
2799     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2800     return j;
2801 }
2802
2803 UChar32 RBBISentMonkey::cAt(int pos) {
2804     if (pos<0 || pos>=fText->length()) {
2805         return -1;
2806     } else {
2807         return fText->char32At(pos);
2808     }
2809 }
2810
2811 int32_t RBBISentMonkey::next(int32_t prevPos) {
2812     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2813                               //   break position being tested.  The candidate break
2814                               //   location is before p2.
2815
2816     int     breakPos = -1;
2817
2818     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2819     UChar32 c;
2820
2821     if (U_FAILURE(deferredStatus)) {
2822         return -1;
2823     }
2824
2825     // Prev break at end of string.  return DONE.
2826     if (prevPos >= fText->length()) {
2827         return -1;
2828     }
2829     p0 = p1 = p2 = p3 = prevPos;
2830     c3 =  fText->char32At(prevPos);
2831     c0 = c1 = c2 = 0;
2832     (void)p0;     // Suppress set but not used warning.
2833
2834     // Loop runs once per "significant" character position in the input text.
2835     for (;;) {
2836         // Move all of the positions forward in the input string.
2837         p0 = p1;  c0 = c1;
2838         p1 = p2;  c1 = c2;
2839         p2 = p3;  c2 = c3;
2840
2841         // Advancd p3 by    X(Extend | Format)*   Rule 4
2842         p3 = moveForward(p3);
2843         c3 = cAt(p3);
2844
2845         // Rule (3)  CR x LF
2846         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2847             continue;
2848         }
2849
2850         // Rule (4).   Sep  <break>
2851         if (fSepSet->contains(c1)) {
2852             p2 = p1+1;   // Separators don't combine with Extend or Format.
2853             break;
2854         }
2855
2856         if (p2 >= fText->length()) {
2857             // Reached end of string.  Always a break position.
2858             break;
2859         }
2860
2861         if (p2 == prevPos) {
2862             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2863             continue;
2864         }
2865
2866         // Rule (6).   ATerm x Numeric
2867         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2868             continue;
2869         }
2870
2871         // Rule (7).  (Upper | Lower) ATerm  x  Uppper
2872         if ((fUpperSet->contains(c0) || fLowerSet->contains(c0)) &&
2873                 fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2874             continue;
2875         }
2876
2877         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2878         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2879         //                  note to the Unicode 5.0 documents.
2880         int p8 = p1;
2881         while (fSpSet->contains(cAt(p8))) {
2882             p8 = moveBack(p8);
2883         }
2884         while (fCloseSet->contains(cAt(p8))) {
2885             p8 = moveBack(p8);
2886         }
2887         if (fATermSet->contains(cAt(p8))) {
2888             p8=p2;
2889             for (;;) {
2890                 c = cAt(p8);
2891                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2892                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2893                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2894                     break;
2895                 }
2896                 p8 = moveForward(p8);
2897             }
2898             if (fLowerSet->contains(cAt(p8))) {
2899                 continue;
2900             }
2901         }
2902
2903         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2904         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2905             p8 = p1;
2906             while (fSpSet->contains(cAt(p8))) {
2907                 p8 = moveBack(p8);
2908             }
2909             while (fCloseSet->contains(cAt(p8))) {
2910                 p8 = moveBack(p8);
2911             }
2912             c = cAt(p8);
2913             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2914                 continue;
2915             }
2916         }
2917
2918         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2919         int p9 = p1;
2920         while (fCloseSet->contains(cAt(p9))) {
2921             p9 = moveBack(p9);
2922         }
2923         c = cAt(p9);
2924         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2925             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2926                 continue;
2927             }
2928         }
2929
2930         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2931         int p10 = p1;
2932         while (fSpSet->contains(cAt(p10))) {
2933             p10 = moveBack(p10);
2934         }
2935         while (fCloseSet->contains(cAt(p10))) {
2936             p10 = moveBack(p10);
2937         }
2938         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2939             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2940                 continue;
2941             }
2942         }
2943
2944         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2945         int p11 = p1;
2946         if (fSepSet->contains(cAt(p11))) {
2947             p11 = moveBack(p11);
2948         }
2949         while (fSpSet->contains(cAt(p11))) {
2950             p11 = moveBack(p11);
2951         }
2952         while (fCloseSet->contains(cAt(p11))) {
2953             p11 = moveBack(p11);
2954         }
2955         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2956             break;
2957         }
2958
2959         //  Rule (12)  Any x Any
2960         continue;
2961     }
2962     breakPos = p2;
2963     return breakPos;
2964 }
2965
2966 RBBISentMonkey::~RBBISentMonkey() {
2967     delete fSets;
2968     delete fSepSet;
2969     delete fFormatSet;
2970     delete fSpSet;
2971     delete fLowerSet;
2972     delete fUpperSet;
2973     delete fOLetterSet;
2974     delete fNumericSet;
2975     delete fATermSet;
2976     delete fSContinueSet;
2977     delete fSTermSet;
2978     delete fCloseSet;
2979     delete fOtherSet;
2980     delete fExtendSet;
2981 }
2982
2983
2984
2985 //-------------------------------------------------------------------------------------------
2986 //
2987 //  RBBILineMonkey
2988 //
2989 //-------------------------------------------------------------------------------------------
2990
2991 class RBBILineMonkey: public RBBIMonkeyKind {
2992 public:
2993     RBBILineMonkey();
2994     virtual          ~RBBILineMonkey();
2995     virtual  UVector *charClasses();
2996     virtual  void     setText(const UnicodeString &s);
2997     virtual  int32_t  next(int32_t i);
2998     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2999 private:
3000     UVector      *fSets;
3001
3002     UnicodeSet  *fBK;
3003     UnicodeSet  *fCR;
3004     UnicodeSet  *fLF;
3005     UnicodeSet  *fCM;
3006     UnicodeSet  *fNL;
3007     UnicodeSet  *fSG;
3008     UnicodeSet  *fWJ;
3009     UnicodeSet  *fZW;
3010     UnicodeSet  *fGL;
3011     UnicodeSet  *fCB;
3012     UnicodeSet  *fSP;
3013     UnicodeSet  *fB2;
3014     UnicodeSet  *fBA;
3015     UnicodeSet  *fBB;
3016     UnicodeSet  *fHY;
3017     UnicodeSet  *fH2;
3018     UnicodeSet  *fH3;
3019     UnicodeSet  *fCL;
3020     UnicodeSet  *fCP;
3021     UnicodeSet  *fEX;
3022     UnicodeSet  *fIN;
3023     UnicodeSet  *fJL;
3024     UnicodeSet  *fJV;
3025     UnicodeSet  *fJT;
3026     UnicodeSet  *fNS;
3027     UnicodeSet  *fOP;
3028     UnicodeSet  *fQU;
3029     UnicodeSet  *fIS;
3030     UnicodeSet  *fNU;
3031     UnicodeSet  *fPO;
3032     UnicodeSet  *fPR;
3033     UnicodeSet  *fSY;
3034     UnicodeSet  *fAI;
3035     UnicodeSet  *fAL;
3036     UnicodeSet  *fCJ;
3037     UnicodeSet  *fHL;
3038     UnicodeSet  *fID;
3039     UnicodeSet  *fRI;
3040     UnicodeSet  *fXX;
3041     UnicodeSet  *fEB;
3042     UnicodeSet  *fEM;
3043     UnicodeSet  *fZJ;
3044
3045     BreakIterator        *fCharBI;
3046     const UnicodeString  *fText;
3047     RegexMatcher         *fNumberMatcher;
3048 };
3049
3050 RBBILineMonkey::RBBILineMonkey() :
3051     RBBIMonkeyKind(),
3052     fSets(NULL),
3053
3054     fCharBI(NULL),
3055     fText(NULL),
3056     fNumberMatcher(NULL)
3057
3058 {
3059     if (U_FAILURE(deferredStatus)) {
3060         return;
3061     }
3062
3063     UErrorCode  status = U_ZERO_ERROR;
3064
3065     fSets  = new UVector(status);
3066
3067     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3068     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3069     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3070     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3071     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3072     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3073     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3074     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3075     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3076     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3077     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3078     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3079     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3080     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3081     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3082     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3083     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3084     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3085     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3086     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3087     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3088     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3089     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3090     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3091     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3092     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3093     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3094     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3095     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3096     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3097     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3098     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3099     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3100     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
3101     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
3102     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3103     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
3104     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3105     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3106     fEB    = new UnicodeSet(UnicodeString(
3107                 "[\\u261D\\u26F9\\u270A-\\u270D\\U0001F385\\U0001F3C2-\\U0001F3C4\\U0001F3C7\\U0001F3CA-\\U0001F3CC"
3108                 "\\U0001F442-\\U0001F443\\U0001F446-\\U0001F450\\U0001F466-\\U0001F478\\U0001F47C"
3109                 "\\U0001F481-\\U0001F483\\U0001F485-\\U0001F487\\U0001F4AA\\U0001F574-\\U0001F575\\U0001F57A\\U0001F590\\U0001F595-\\U0001F596"
3110                 "\\U0001F645-\\U0001F647\\U0001F64B-\\U0001F64F\\U0001F6A3\\U0001F6B4-\\U0001F6B6\\U0001F6C0\\U0001F6CC"
3111                 "\\U0001F918-\\U0001F91E\\U0001F926\\U0001F930\\U0001F933-\\U0001F939\\U0001F93C-\\U0001F93E]"), status);
3112     fEM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\U0001F3FB-\\U0001F3FF]"), status);
3113     fZJ    = new UnicodeSet((UChar32)0x200D, (UChar32)0x200D);
3114
3115     if (U_FAILURE(status)) {
3116         deferredStatus = status;
3117         return;
3118     }
3119
3120     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3121     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3122     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3123
3124     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
3125
3126     fID->addAll(*fEB);     // Emoji Base and Emoji Modifier behave as ID.
3127     fID->addAll(*fEM);
3128     fAL->removeAll(*fEM);
3129
3130
3131     fAL->remove((UChar32)0x2695);   // move u2695 from Al to Id
3132     fAL->remove((UChar32)0x2696);   // move u2696 from Al to Id
3133     fAL->remove((UChar32)0x2764);   // Emoji Proposal: move u2764 from Al to Id
3134     fAI->remove((UChar32)0x2640);   // new ZWJ seqs
3135     fAI->remove((UChar32)0x2642);   // new ZWJ seqs
3136     fID->add((UChar32)0x2695);
3137     fID->add((UChar32)0x2696);
3138     fID->add((UChar32)0x2764);
3139     fID->add((UChar32)0x2640);
3140     fID->add((UChar32)0x2642);
3141
3142     fSets->addElement(fBK, status);
3143     fSets->addElement(fCR, status);
3144     fSets->addElement(fLF, status);
3145     fSets->addElement(fCM, status);
3146     fSets->addElement(fNL, status);
3147     fSets->addElement(fWJ, status);
3148     fSets->addElement(fZW, status);
3149     fSets->addElement(fGL, status);
3150     fSets->addElement(fCB, status);
3151     fSets->addElement(fSP, status);
3152     fSets->addElement(fB2, status);
3153     fSets->addElement(fBA, status);
3154     fSets->addElement(fBB, status);
3155     fSets->addElement(fHY, status);
3156     fSets->addElement(fH2, status);
3157     fSets->addElement(fH3, status);
3158     fSets->addElement(fCL, status);
3159     fSets->addElement(fCP, status);
3160     fSets->addElement(fEX, status);
3161     fSets->addElement(fIN, status);
3162     fSets->addElement(fJL, status);
3163     fSets->addElement(fJT, status);
3164     fSets->addElement(fJV, status);
3165     fSets->addElement(fNS, status);
3166     fSets->addElement(fOP, status);
3167     fSets->addElement(fQU, status);
3168     fSets->addElement(fIS, status);
3169     fSets->addElement(fNU, status);
3170     fSets->addElement(fPO, status);
3171     fSets->addElement(fPR, status);
3172     fSets->addElement(fSY, status);
3173     fSets->addElement(fAI, status);
3174     fSets->addElement(fAL, status);
3175     fSets->addElement(fHL, status);
3176     fSets->addElement(fID, status);
3177     fSets->addElement(fWJ, status);
3178     fSets->addElement(fRI, status);
3179     fSets->addElement(fSG, status);
3180     fSets->addElement(fEB, status);
3181     fSets->addElement(fEM, status);
3182     fSets->addElement(fZJ, status);
3183
3184     const char *rules =
3185             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3186             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3187             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3188             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3189             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3190             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3191
3192     fNumberMatcher = new RegexMatcher(
3193         UnicodeString(rules, -1, US_INV), 0, status);
3194
3195     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3196
3197     if (U_FAILURE(status)) {
3198         deferredStatus = status;
3199     }
3200 }
3201
3202
3203 void RBBILineMonkey::setText(const UnicodeString &s) {
3204     fText       = &s;
3205     fCharBI->setText(s);
3206     fNumberMatcher->reset(s);
3207 }
3208
3209 //
3210 //  rule9Adjust
3211 //     Line Break TR rules 9 and 10 implementation.
3212 //     This deals with combining marks and other sequences
3213 //     that must be treated as if they were something other than what they actually are.
3214 //
3215 //     This is factored out into a separate function because it must be applied twice for
3216 //     each potential break, once to the chars before the position being checked, then
3217 //     again to the text following the possible break.
3218 //
3219 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3220     if (pos == -1) {
3221         // Invalid initial position.  Happens during the warmup iteration of the
3222         //   main loop in next().
3223         return;
3224     }
3225
3226     int32_t  nPos = *nextPos;
3227
3228     // LB 9  Keep combining sequences together.
3229     //  advance over any CM class chars.  Note that Line Break CM is different
3230     //  from the normal Grapheme Extend property.
3231     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3232           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3233         for (;;) {
3234             *nextChar = fText->char32At(nPos);
3235             if (!fCM->contains(*nextChar)) {
3236                 break;
3237             }
3238             nPos = fText->moveIndex32(nPos, 1);
3239         }
3240     }
3241
3242
3243     // LB 9 Treat X CM* as if it were x.
3244     //       No explicit action required.
3245
3246     // LB 10  Treat any remaining combining mark as AL
3247     if (fCM->contains(*posChar)) {
3248         *posChar = 0x41;   // thisChar = 'A';
3249     }
3250
3251     // Push the updated nextPos and nextChar back to our caller.
3252     // This only makes a difference if posChar got bigger by consuming a
3253     // combining sequence.
3254     *nextPos  = nPos;
3255     *nextChar = fText->char32At(nPos);
3256 }
3257
3258
3259
3260 int32_t RBBILineMonkey::next(int32_t startPos) {
3261     UErrorCode status = U_ZERO_ERROR;
3262     int32_t    pos;       //  Index of the char following a potential break position
3263     UChar32    thisChar;  //  Character at above position "pos"
3264
3265     int32_t    prevPos;   //  Index of the char preceding a potential break position
3266     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3267                           //   and thisChar may not be adjacent because combining
3268                           //   characters between them will be ignored.
3269
3270     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3271     UChar32    prevCharX2;
3272
3273     int32_t    nextPos;   //  Index of the next character following pos.
3274                           //     Usually skips over combining marks.
3275     int32_t    nextCPPos; //  Index of the code point following "pos."
3276                           //     May point to a combining mark.
3277     int32_t    tPos;      //  temp value.
3278     UChar32    c;
3279
3280     if (U_FAILURE(deferredStatus)) {
3281         return -1;
3282     }
3283
3284     if (startPos >= fText->length()) {
3285         return -1;
3286     }
3287
3288
3289     // Initial values for loop.  Loop will run the first time without finding breaks,
3290     //                           while the invalid values shift out and the "this" and
3291     //                           "prev" positions are filled in with good values.
3292     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3293     thisChar = prevChar  = prevCharX2 = 0;
3294     nextPos  = nextCPPos = startPos;
3295
3296
3297     // Loop runs once per position in the test text, until a break position
3298     //  is found.
3299     for (;;) {
3300         prevPosX2 = prevPos;
3301         prevCharX2 = prevChar;
3302
3303         prevPos   = pos;
3304         prevChar  = thisChar;
3305
3306         pos       = nextPos;
3307         thisChar  = fText->char32At(pos);
3308
3309         nextCPPos = fText->moveIndex32(pos, 1);
3310         nextPos   = nextCPPos;
3311
3312         // Rule LB2 - Break at end of text.
3313         if (pos >= fText->length()) {
3314             break;
3315         }
3316
3317         // Rule LB 9 - adjust for combining sequences.
3318         //             We do this one out-of-order because the adjustment does not change anything
3319         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3320         //             be applied.
3321         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3322         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3323         c = fText->char32At(nextPos);
3324         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3325
3326         // If the loop is still warming up - if we haven't shifted the initial
3327         //   -1 positions out of prevPos yet - loop back to advance the
3328         //    position in the input without any further looking for breaks.
3329         if (prevPos == -1) {
3330             continue;
3331         }
3332
3333         // LB 4  Always break after hard line breaks,
3334         if (fBK->contains(prevChar)) {
3335             break;
3336         }
3337
3338         // LB 5  Break after CR, LF, NL, but not inside CR LF
3339         if (prevChar == 0x0d && thisChar == 0x0a) {
3340             continue;
3341         }
3342         if (prevChar == 0x0d ||
3343             prevChar == 0x0a ||
3344             prevChar == 0x85)  {
3345             break;
3346         }
3347
3348         // LB 6  Don't break before hard line breaks
3349         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3350             fBK->contains(thisChar)) {
3351                 continue;
3352         }
3353
3354
3355         // LB 7  Don't break before spaces or zero-width space.
3356         if (fSP->contains(thisChar)) {
3357             continue;
3358         }
3359
3360         if (fZW->contains(thisChar)) {
3361             continue;
3362         }
3363
3364         // LB 8  Break after zero width space
3365         if (fZW->contains(prevChar)) {
3366             break;
3367         }
3368
3369         // LB 8a ZJ x ID
3370         //       The monkey test's way of ignoring combining characters doesn't work
3371         //       for this rule. ZJ is also a CM. Need to get the actual character
3372         //       preceding "thisChar", not ignoring combining marks, possibly ZJ.
3373         {
3374             int32_t prevIdx = fText->moveIndex32(pos, -1);
3375             UChar32 prevC = fText->char32At(prevIdx);
3376             if (fZJ->contains(prevC) && fID->contains(thisChar)) {
3377                 continue;
3378             }
3379         }
3380
3381         // LB 9, 10  Already done, at top of loop.
3382         //
3383
3384
3385         // LB 11  Do not break before or after WORD JOINER and related characters.
3386         //    x  WJ
3387         //    WJ  x
3388         //
3389         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3390             continue;
3391         }
3392
3393         // LB 12
3394         //    GL  x
3395         if (fGL->contains(prevChar)) {
3396             continue;
3397         }
3398
3399         // LB 12a
3400         //    [^SP BA HY] x GL
3401         if (!(fSP->contains(prevChar) ||
3402               fBA->contains(prevChar) ||
3403               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3404             continue;
3405         }
3406
3407
3408
3409         // LB 13  Don't break before closings.
3410         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3411         //        fall into LB 17 and the more general number regular expression.
3412         //
3413         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3414             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3415                                          fEX->contains(thisChar)  ||
3416             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3417             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3418             continue;
3419         }
3420
3421         // LB 14 Don't break after OP SP*
3422         //       Scan backwards, checking for this sequence.
3423         //       The OP char could include combining marks, so we actually check for
3424         //           OP CM* SP*
3425         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3426         //       sequence into a ID char, so before scanning back through spaces,
3427         //       verify that prevChar is indeed a space.  The prevChar variable
3428         //       may differ from fText[prevPos]
3429         tPos = prevPos;
3430         if (fSP->contains(prevChar)) {
3431             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3432                 tPos=fText->moveIndex32(tPos, -1);
3433             }
3434         }
3435         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3436             tPos=fText->moveIndex32(tPos, -1);
3437         }
3438         if (fOP->contains(fText->char32At(tPos))) {
3439             continue;
3440         }
3441
3442
3443         // LB 15    QU SP* x OP
3444         if (fOP->contains(thisChar)) {
3445             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3446             int tPos = prevPos;
3447             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3448                 tPos = fText->moveIndex32(tPos, -1);
3449             }
3450             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3451                 tPos = fText->moveIndex32(tPos, -1);
3452             }
3453             if (fQU->contains(fText->char32At(tPos))) {
3454                 continue;
3455             }
3456         }
3457
3458
3459
3460         // LB 16   (CL | CP) SP* x NS
3461         //    Scan backwards for SP* CM* (CL | CP)
3462         if (fNS->contains(thisChar)) {
3463             int tPos = prevPos;
3464             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3465                 tPos = fText->moveIndex32(tPos, -1);
3466             }
3467             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3468                 tPos = fText->moveIndex32(tPos, -1);
3469             }
3470             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3471                 continue;
3472             }
3473         }
3474
3475
3476         // LB 17        B2 SP* x B2
3477         if (fB2->contains(thisChar)) {
3478             //  Scan backwards, checking for the B2 CM* SP* sequence.
3479             tPos = prevPos;
3480             if (fSP->contains(prevChar)) {
3481                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3482                     tPos=fText->moveIndex32(tPos, -1);
3483                 }
3484             }
3485             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3486                 tPos=fText->moveIndex32(tPos, -1);
3487             }
3488             if (fB2->contains(fText->char32At(tPos))) {
3489                 continue;
3490             }
3491         }
3492
3493
3494         // LB 18    break after space
3495         if (fSP->contains(prevChar)) {
3496             break;
3497         }
3498
3499         // LB 19
3500         //    x   QU
3501         //    QU  x
3502         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3503             continue;
3504         }
3505
3506         // LB 20  Break around a CB
3507         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3508             break;
3509         }
3510
3511         // LB 21
3512         if (fBA->contains(thisChar) ||
3513             fHY->contains(thisChar) ||
3514             fNS->contains(thisChar) ||
3515             fBB->contains(prevChar) )   {
3516             continue;
3517         }
3518
3519         // LB 21a
3520         //   HL (HY | BA) x
3521         if (fHL->contains(prevCharX2) &&
3522                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3523             continue;
3524         }
3525
3526         // LB 21b
3527         //   SY x HL
3528         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3529             continue;
3530         }
3531
3532         // LB 22
3533         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3534             (fEX->contains(prevChar) && fIN->contains(thisChar)) ||
3535             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3536             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3537             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3538             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3539             continue;
3540         }
3541
3542
3543         // LB 23    ID x PO
3544         //          AL x NU
3545         //          HL x NU
3546         //          NU x AL
3547         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3548             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3549             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3550             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3551             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3552             continue;
3553         }
3554
3555         // LB 24  Do not break between prefix and letters or ideographs.
3556         //        PR x ID
3557         //        PR x (AL | HL)
3558         //        PO x (AL | HL)
3559         //        (AL | HL) x PR        // Apple early addition
3560         //        (AL | HL) x PO        // Apple early addition
3561         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3562             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3563             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3564             ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPR->contains(thisChar)) ||
3565             ((fAL->contains(prevChar) || fHL->contains(prevChar)) && fPO->contains(thisChar)) ) {
3566             continue;
3567         }
3568
3569
3570
3571         // LB 25    Numbers
3572         if (fNumberMatcher->lookingAt(prevPos, status)) {
3573             if (U_FAILURE(status)) {
3574                 break;
3575             }
3576             // Matched a number.  But could have been just a single digit, which would
3577             //    not represent a "no break here" between prevChar and thisChar
3578             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3579             if (numEndIdx > pos) {
3580                 // Number match includes at least our two chars being checked
3581                 if (numEndIdx > nextPos) {
3582                     // Number match includes additional chars.  Update pos and nextPos
3583                     //   so that next loop iteration will continue at the end of the number,
3584                     //   checking for breaks between last char in number & whatever follows.
3585                     pos = nextPos = numEndIdx;
3586                     do {
3587                         pos = fText->moveIndex32(pos, -1);
3588                         thisChar = fText->char32At(pos);
3589                     } while (fCM->contains(thisChar));
3590                 }
3591                 continue;
3592             }
3593         }
3594
3595
3596         // LB 26 Do not break a Korean syllable.
3597         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3598                                         fJV->contains(thisChar) ||
3599                                         fH2->contains(thisChar) ||
3600                                         fH3->contains(thisChar))) {
3601                                             continue;
3602                                         }
3603
3604         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3605             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3606                 continue;
3607         }
3608
3609         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3610             fJT->contains(thisChar)) {
3611                 continue;
3612         }
3613
3614         // LB 27 Treat a Korean Syllable Block the same as ID.
3615         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3616             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3617             fIN->contains(thisChar)) {
3618                 continue;
3619             }
3620         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3621             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3622             fPO->contains(thisChar)) {
3623                 continue;
3624             }
3625         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3626             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3627                 continue;
3628             }
3629
3630
3631
3632         // LB 28  Do not break between alphabetics ("at").
3633         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3634             continue;
3635         }
3636
3637         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3638         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3639             continue;
3640         }
3641
3642         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3643         //          (AL | NU) x OP
3644         //          CP x (AL | NU)
3645         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3646             continue;
3647         }
3648         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3649             continue;
3650         }
3651
3652         // LB30a    RI RI <break> RI
3653         //             RI    x    RI
3654         if (fRI->contains(prevCharX2) && fRI->contains(prevChar) && fRI->contains(thisChar)) {
3655             break;
3656         }
3657         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3658             continue;
3659         }
3660
3661         // LB30b    Emoji Base x Emoji Modifier
3662         if (fEB->contains(prevChar) && fEM->contains(thisChar)) {
3663             continue;
3664         }
3665
3666         // LB 31    Break everywhere else
3667         break;
3668
3669     }
3670
3671     return pos;
3672 }
3673
3674
3675 UVector  *RBBILineMonkey::charClasses() {
3676     return fSets;
3677 }
3678
3679
3680 RBBILineMonkey::~RBBILineMonkey() {
3681     delete fSets;
3682
3683     delete fBK;
3684     delete fCR;
3685     delete fLF;
3686     delete fCM;
3687     delete fNL;
3688     delete fWJ;
3689     delete fZW;
3690     delete fGL;
3691     delete fCB;
3692     delete fSP;
3693     delete fB2;
3694     delete fBA;
3695     delete fBB;
3696     delete fHY;
3697     delete fH2;
3698     delete fH3;
3699     delete fCL;
3700     delete fCP;
3701     delete fEX;
3702     delete fIN;
3703     delete fJL;
3704     delete fJV;
3705     delete fJT;
3706     delete fNS;
3707     delete fOP;
3708     delete fQU;
3709     delete fIS;
3710     delete fNU;
3711     delete fPO;
3712     delete fPR;
3713     delete fSY;
3714     delete fAI;
3715     delete fAL;
3716     delete fCJ;
3717     delete fHL;
3718     delete fID;
3719     delete fRI;
3720     delete fSG;
3721     delete fXX;
3722     delete fEB;
3723     delete fEM;
3724     delete fZJ;
3725
3726     delete fCharBI;
3727     delete fNumberMatcher;
3728 }
3729
3730
3731 //-------------------------------------------------------------------------------------------
3732 //
3733 //   TestMonkey
3734 //
3735 //     params
3736 //       seed=nnnnn        Random number starting seed.
3737 //                         Setting the seed allows errors to be reproduced.
3738 //       loop=nnn          Looping count.  Controls running time.
3739 //                         -1:  run forever.
3740 //                          0 or greater:  run length.
3741 //
3742 //       type = char | word | line | sent | title
3743 //
3744 //  Example:
3745 //     intltest  rbbi/RBBITest/TestMonkey@"type=line loop=-1"
3746 //
3747 //-------------------------------------------------------------------------------------------
3748
3749 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3750     int32_t val = defaultVal;
3751     name.append(" *= *(-?\\d+)");
3752     UErrorCode status = U_ZERO_ERROR;
3753     RegexMatcher m(name, params, 0, status);
3754     if (m.find()) {
3755         // The param exists.  Convert the string to an int.
3756         char valString[100];
3757         int32_t paramLength = m.end(1, status) - m.start(1, status);
3758         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3759             paramLength = (int32_t)(sizeof(valString)-2);
3760         }
3761         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3762         val = strtol(valString,  NULL, 10);
3763
3764         // Delete this parameter from the params string.
3765         m.reset();
3766         params = m.replaceFirst("", status);
3767     }
3768     U_ASSERT(U_SUCCESS(status));
3769     return val;
3770 }
3771 #endif
3772
3773 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3774 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3775                                     BreakIterator *bi,
3776                                     int expected[],
3777                                     int expectedcount)
3778 {
3779     int count = 0;
3780     int i = 0;
3781     int forward[50];
3782     bi->setText(ustr);
3783     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3784         forward[count] = i;
3785         if (count < expectedcount && expected[count] != i) {
3786             test->errln("break forward test failed: expected %d but got %d",
3787                         expected[count], i);
3788             break;
3789         }
3790         count ++;
3791     }
3792     if (count != expectedcount) {
3793         printStringBreaks(ustr, expected, expectedcount);
3794         test->errln("break forward test failed: missed %d match",
3795                     expectedcount - count);
3796         return;
3797     }
3798     // testing boundaries
3799     for (i = 1; i < expectedcount; i ++) {
3800         int j = expected[i - 1];
3801         if (!bi->isBoundary(j)) {
3802             printStringBreaks(ustr, expected, expectedcount);
3803             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3804             return;
3805         }
3806         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3807             if (bi->isBoundary(j)) {
3808                 printStringBreaks(ustr, expected, expectedcount);
3809                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3810                 return;
3811             }
3812         }
3813     }
3814
3815     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3816         count --;
3817         if (forward[count] != i) {
3818             printStringBreaks(ustr, expected, expectedcount);
3819             test->errln("happy break test previous() failed: expected %d but got %d",
3820                         forward[count], i);
3821             break;
3822         }
3823     }
3824     if (count != 0) {
3825         printStringBreaks(ustr, expected, expectedcount);
3826         test->errln("break test previous() failed: missed a match");
3827         return;
3828     }
3829
3830     // testing preceding
3831     for (i = 0; i < expectedcount - 1; i ++) {
3832         // int j = expected[i] + 1;
3833         int j = ustr.moveIndex32(expected[i], 1);
3834         for (; j <= expected[i + 1]; j ++) {
3835             if (bi->preceding(j) != expected[i]) {
3836                 printStringBreaks(ustr, expected, expectedcount);
3837                 test->errln("preceding(): Not expecting boundary at position %d", j);
3838                 return;
3839             }
3840         }
3841     }
3842 }
3843 #endif
3844
3845 void RBBITest::TestWordBreaks(void)
3846 {
3847 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3848
3849     Locale        locale("en");
3850     UErrorCode    status = U_ZERO_ERROR;
3851     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3852     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3853     // Replaced any C+J characters in a row with a random sequence of characters
3854     // of the same length to make our C+J segmentation not get in the way.
3855     static const char *strlist[] =
3856     {
3857     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3858     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3859     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3860     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3861     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3862     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3863     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3864     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3865     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3866     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3867     "\\u2027\\U000e0067\\u0a47\\u00b7",
3868     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3869     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3870     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3871     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3872     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3873     "\\u0027\\u11af\\U000e0057\\u0602",
3874     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3875     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3876     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3877     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3878     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3879     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3880     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3881     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3882     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3883     "\\u18f4\\U000e0049\\u20e7\\u2027",
3884     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3885     "\\ua183\\u102d\\u0bec\\u003a",
3886     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3887     "\\u003a\\u0e57\\u0fad\\u002e",
3888     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3889     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3890     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3891     "\\u003a\\u0664\\u00b7\\u1fba",
3892     "\\u003b\\u0027\\u00b7\\u47a3",
3893     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3894     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3895     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3896     };
3897     int loop;
3898     if (U_FAILURE(status)) {
3899         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3900         return;
3901     }
3902     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3903         // printf("looping %d\n", loop);
3904         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3905         // RBBICharMonkey monkey;
3906         RBBIWordMonkey monkey;
3907
3908         int expected[50];
3909         int expectedcount = 0;
3910
3911         monkey.setText(ustr);
3912         int i;
3913         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3914             expected[expectedcount ++] = i;
3915         }
3916
3917         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3918     }
3919     delete bi;
3920 #endif
3921 }
3922
3923 void RBBITest::TestWordBoundary(void)
3924 {
3925     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3926     Locale        locale("en");
3927     UErrorCode    status = U_ZERO_ERROR;
3928     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3929     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3930     UChar         str[50];
3931     static const char *strlist[] =
3932     {
3933     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3934     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3935     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3936     "\\u2027\\U000e0067\\u0a47\\u00b7",
3937     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3938     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3939     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3940     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3941     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3942     "\\u0027\\u11af\\U000e0057\\u0602",
3943     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3944     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3945     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3946     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3947     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3948     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3949     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3950     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3951     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3952     "\\u58f4\\U000e0049\\u20e7\\u2027",
3953     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3954     "\\ua183\\u102d\\u0bec\\u003a",
3955     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3956     "\\u003a\\u0e57\\u0fad\\u002e",
3957     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3958     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3959     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3960     "\\u003a\\u0664\\u00b7\\u1fba",
3961     "\\u003b\\u0027\\u00b7\\u47a3",
3962     };
3963     int loop;
3964     if (U_FAILURE(status)) {
3965         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3966         return;
3967     }
3968     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
3969         // printf("looping %d\n", loop);
3970         u_unescape(strlist[loop], str, 20);
3971         UnicodeString ustr(str);
3972         int forward[50];
3973         int count = 0;
3974
3975         bi->setText(ustr);
3976         int prev = 0;
3977         int i;
3978         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3979             forward[count ++] = i;
3980             if (i > prev) {
3981                 int j;
3982                 for (j = prev + 1; j < i; j ++) {
3983                     if (bi->isBoundary(j)) {
3984                         printStringBreaks(ustr, forward, count);
3985                         errln("happy boundary test failed: expected %d not a boundary",
3986                                j);
3987                         return;
3988                     }
3989                 }
3990             }
3991             if (!bi->isBoundary(i)) {
3992                 printStringBreaks(ustr, forward, count);
3993                 errln("happy boundary test failed: expected %d a boundary",
3994                        i);
3995                 return;
3996             }
3997             prev = i;
3998         }
3999     }
4000     delete bi;
4001 }
4002
4003 void RBBITest::TestLineBreaks(void)
4004 {
4005 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4006     Locale        locale("en");
4007     UErrorCode    status = U_ZERO_ERROR;
4008     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4009     const int32_t  STRSIZE = 50;
4010     UChar         str[STRSIZE];
4011     static const char *strlist[] =
4012     {
4013      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4014      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4015              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4016      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4017              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4018      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4019      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4020      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4021      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4022      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4023      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4024      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4025      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4026      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4027      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4028      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4029      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4030      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4031      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4032      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4033      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4034      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4035      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4036      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4037      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4038      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4039      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4040      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4041      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4042      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4043      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4044      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4045      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4046      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4047      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4048      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4049      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4050      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4051      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4052          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4053     };
4054     int loop;
4055     TEST_ASSERT_SUCCESS(status);
4056     if (U_FAILURE(status)) {
4057         return;
4058     }
4059     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4060         // printf("looping %d\n", loop);
4061         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4062         if (t >= STRSIZE) {
4063             TEST_ASSERT(FALSE);
4064             continue;
4065         }
4066
4067
4068         UnicodeString ustr(str);
4069         RBBILineMonkey monkey;
4070         if (U_FAILURE(monkey.deferredStatus)) {
4071             continue;
4072         }
4073
4074         const int EXPECTEDSIZE = 50;
4075         int expected[EXPECTEDSIZE];
4076         int expectedcount = 0;
4077
4078         monkey.setText(ustr);
4079         int i;
4080         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4081             if (expectedcount >= EXPECTEDSIZE) {
4082                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4083                 return;
4084             }
4085             expected[expectedcount ++] = i;
4086         }
4087
4088         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4089     }
4090     delete bi;
4091 #endif
4092 }
4093
4094 void RBBITest::TestSentBreaks(void)
4095 {
4096 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4097     Locale        locale("en");
4098     UErrorCode    status = U_ZERO_ERROR;
4099     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4100     UChar         str[200];
4101     static const char *strlist[] =
4102     {
4103      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4104      "This\n",
4105      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4106      "\"Sentence ending with a quote.\" Bye.",
4107      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4108      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4109      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4110      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4111      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4112      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4113      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4114              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4115              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4116              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4117      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4118              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4119              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4120              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4121              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4122              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4123     };
4124     int loop;
4125     if (U_FAILURE(status)) {
4126         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4127         return;
4128     }
4129     for (loop = 0; loop < UPRV_LENGTHOF(strlist); loop ++) {
4130         u_unescape(strlist[loop], str, UPRV_LENGTHOF(str));
4131         UnicodeString ustr(str);
4132
4133         RBBISentMonkey monkey;
4134         if (U_FAILURE(monkey.deferredStatus)) {
4135             continue;
4136         }
4137
4138         const int EXPECTEDSIZE = 50;
4139         int expected[EXPECTEDSIZE];
4140         int expectedcount = 0;
4141
4142         monkey.setText(ustr);
4143         int i;
4144         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4145             if (expectedcount >= EXPECTEDSIZE) {
4146                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4147                 return;
4148             }
4149             expected[expectedcount ++] = i;
4150         }
4151
4152         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4153     }
4154     delete bi;
4155 #endif
4156 }
4157
4158 void RBBITest::TestMonkey(char *params) {
4159 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4160
4161     UErrorCode     status    = U_ZERO_ERROR;
4162     int32_t        loopCount = 500;
4163     int32_t        seed      = 1;
4164     UnicodeString  breakType = "all";
4165     Locale         locale("en");
4166     UBool          useUText  = FALSE;
4167
4168     if (quick == FALSE) {
4169         loopCount = 10000;
4170     }
4171
4172     if (params) {
4173         UnicodeString p(params);
4174         loopCount = getIntParam("loop", p, loopCount);
4175         seed      = getIntParam("seed", p, seed);
4176
4177         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4178         if (m.find()) {
4179             breakType = m.group(1, status);
4180             m.reset();
4181             p = m.replaceFirst("", status);
4182         }
4183
4184         RegexMatcher u(" *utext", p, 0, status);
4185         if (u.find()) {
4186             useUText = TRUE;
4187             u.reset();
4188             p = u.replaceFirst("", status);
4189         }
4190
4191
4192         // m.reset(p);
4193         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4194             // Each option is stripped out of the option string as it is processed.
4195             // All options have been checked.  The option string should have been completely emptied..
4196             char buf[100];
4197             p.extract(buf, sizeof(buf), NULL, status);
4198             buf[sizeof(buf)-1] = 0;
4199             errln("Unrecognized or extra parameter:  %s\n", buf);
4200             return;
4201         }
4202
4203     }
4204
4205     if (breakType == "char" || breakType == "all") {
4206         RBBICharMonkey  m;
4207         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4208         if (U_SUCCESS(status)) {
4209             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4210             if (breakType == "all" && useUText==FALSE) {
4211                 // Also run a quick test with UText when "all" is specified
4212                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4213             }
4214         }
4215         else {
4216             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4217         }
4218         delete bi;
4219     }
4220
4221     if (breakType == "word" || breakType == "all") {
4222         logln("Word Break Monkey Test");
4223         RBBIWordMonkey  m;
4224         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4225         if (U_SUCCESS(status)) {
4226             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4227         }
4228         else {
4229             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4230         }
4231         delete bi;
4232     }
4233
4234     if (breakType == "line" || breakType == "all") {
4235         logln("Line Break Monkey Test");
4236         RBBILineMonkey  m;
4237         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4238         if (loopCount >= 10) {
4239             loopCount = loopCount / 5;   // Line break runs slower than the others.
4240         }
4241         if (U_SUCCESS(status)) {
4242             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4243         }
4244         else {
4245             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4246         }
4247         delete bi;
4248     }
4249
4250     if (breakType == "sent" || breakType == "all"  ) {
4251         logln("Sentence Break Monkey Test");
4252         RBBISentMonkey  m;
4253         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4254         if (loopCount >= 10) {
4255             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4256         }
4257         if (U_SUCCESS(status)) {
4258             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4259         }
4260         else {
4261             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4262         }
4263         delete bi;
4264     }
4265
4266 #endif
4267 }
4268
4269 //
4270 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4271 //    Parameters:
4272 //       bi      - the break iterator to use
4273 //       mk      - MonkeyKind, abstraction for obtaining expected results
4274 //       name    - Name of test (char, word, etc.) for use in error messages
4275 //       seed    - Seed for starting random number generator (parameter from user)
4276 //       numIterations
4277 //
4278 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4279                          int32_t numIterations, UBool useUText) {
4280
4281 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4282
4283     const int32_t    TESTSTRINGLEN = 500;
4284     UnicodeString    testText;
4285     int32_t          numCharClasses;
4286     UVector          *chClasses;
4287     int              expected[TESTSTRINGLEN*2 + 1];
4288     int              expectedCount = 0;
4289     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4290     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4291     char             reverseBreaks[TESTSTRINGLEN*2+1];
4292     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4293     char             followingBreaks[TESTSTRINGLEN*2+1];
4294     char             precedingBreaks[TESTSTRINGLEN*2+1];
4295     int              i;
4296     int              loopCount = 0;
4297
4298     m_seed = seed;
4299
4300     numCharClasses = mk.charClasses()->size();
4301     chClasses      = mk.charClasses();
4302
4303     // Check for errors that occured during the construction of the MonkeyKind object.
4304     //  Can't report them where they occured because errln() is a method coming from intlTest,
4305     //  and is not visible outside of RBBITest :-(
4306     if (U_FAILURE(mk.deferredStatus)) {
4307         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4308         return;
4309     }
4310
4311     // Verify that the character classes all have at least one member.
4312     for (i=0; i<numCharClasses; i++) {
4313         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4314         if (s == NULL || s->size() == 0) {
4315             errln("Character Class #%d is null or of zero size.", i);
4316             return;
4317         }
4318     }
4319
4320     while (loopCount < numIterations || numIterations == -1) {
4321         if (numIterations == -1 && loopCount % 10 == 0) {
4322             // If test is running in an infinite loop, display a periodic tic so
4323             //   we can tell that it is making progress.
4324             fprintf(stderr, ".");
4325         }
4326         // Save current random number seed, so that we can recreate the random numbers
4327         //   for this loop iteration in event of an error.
4328         seed = m_seed;
4329
4330         // Populate a test string with data.
4331         testText.truncate(0);
4332         for (i=0; i<TESTSTRINGLEN; i++) {
4333             int32_t  aClassNum = m_rand() % numCharClasses;
4334             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4335             int32_t   charIdx = m_rand() % classSet->size();
4336             UChar32   c = classSet->charAt(charIdx);
4337             if (c < 0) {   // TODO:  deal with sets containing strings.
4338                 errln("%s:%d c < 0", __FILE__, __LINE__);
4339                 break;
4340             }
4341             // Do not assemble a supplementary character from randomly generated separate surrogates.
4342             //   (It could be a dictionary character)
4343             if (U16_IS_TRAIL(c) && testText.length() > 0 && U16_IS_LEAD(testText.charAt(testText.length()-1))) {
4344                 continue;
4345             }
4346
4347             testText.append(c);
4348         }
4349
4350         // Calculate the expected results for this test string.
4351         mk.setText(testText);
4352         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4353         expectedBreaks[0] = 1;
4354         int32_t breakPos = 0;
4355         expectedCount = 0;
4356         for (;;) {
4357             breakPos = mk.next(breakPos);
4358             if (breakPos == -1) {
4359                 break;
4360             }
4361             if (breakPos > testText.length()) {
4362                 errln("breakPos > testText.length()");
4363             }
4364             expectedBreaks[breakPos] = 1;
4365             U_ASSERT(expectedCount<testText.length());
4366             expected[expectedCount ++] = breakPos;
4367             (void)expected;   // Set but not used warning.
4368                               // TODO (andy): check it out.
4369         }
4370
4371         // Find the break positions using forward iteration
4372         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4373         if (useUText) {
4374             UErrorCode status = U_ZERO_ERROR;
4375             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4376             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4377             bi->setText(testUText, status);
4378             TEST_ASSERT_SUCCESS(status);
4379             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4380                                       //  This UText can be closed immediately, so long as the
4381                                       //  testText string continues to exist.
4382         } else {
4383             bi->setText(testText);
4384         }
4385
4386         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4387             if (i < 0 || i > testText.length()) {
4388                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4389                 break;
4390             }
4391             forwardBreaks[i] = 1;
4392         }
4393
4394         // Find the break positions using reverse iteration
4395         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4396         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4397             if (i < 0 || i > testText.length()) {
4398                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4399                 break;
4400             }
4401             reverseBreaks[i] = 1;
4402         }
4403
4404         // Find the break positions using isBoundary() tests.
4405         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4406         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4407         for (i=0; i<=testText.length(); i++) {
4408             isBoundaryBreaks[i] = bi->isBoundary(i);
4409         }
4410
4411
4412         // Find the break positions using the following() function.
4413         // printf(".");
4414         memset(followingBreaks, 0, sizeof(followingBreaks));
4415         int32_t   lastBreakPos = 0;
4416         followingBreaks[0] = 1;
4417         for (i=0; i<testText.length(); i++) {
4418             breakPos = bi->following(i);
4419             if (breakPos <= i ||
4420                 breakPos < lastBreakPos ||
4421                 breakPos > testText.length() ||
4422                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4423                 errln("%s break monkey test: "
4424                     "Out of range value returned by BreakIterator::following().\n"
4425                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4426                          name, seed, i, breakPos, lastBreakPos);
4427                 break;
4428             }
4429             followingBreaks[breakPos] = 1;
4430             lastBreakPos = breakPos;
4431         }
4432
4433         // Find the break positions using the preceding() function.
4434         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4435         lastBreakPos = testText.length();
4436         precedingBreaks[testText.length()] = 1;
4437         for (i=testText.length(); i>0; i--) {
4438             breakPos = bi->preceding(i);
4439             if (breakPos >= i ||
4440                 breakPos > lastBreakPos ||
4441                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4442                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4443                 errln("%s break monkey test: "
4444                     "Out of range value returned by BreakIterator::preceding().\n"
4445                     "index=%d;  prev returned %d; lastBreak=%d" ,
4446                     name,  i, breakPos, lastBreakPos);
4447                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4448                     precedingBreaks[i] = 2;   // Forces an error.
4449                 }
4450             } else {
4451                 if (breakPos >= 0) {
4452                     precedingBreaks[breakPos] = 1;
4453                 }
4454                 lastBreakPos = breakPos;
4455             }
4456         }
4457
4458         // Compare the expected and actual results.
4459         for (i=0; i<=testText.length(); i++) {
4460             const char *errorType = NULL;
4461             if  (forwardBreaks[i] != expectedBreaks[i]) {
4462                 errorType = "next()";
4463             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4464                 errorType = "previous()";
4465             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4466                 errorType = "isBoundary()";
4467             } else if (followingBreaks[i] != expectedBreaks[i]) {
4468                 errorType = "following()";
4469             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4470                 errorType = "preceding()";
4471             }
4472
4473
4474             if (errorType != NULL) {
4475                 // Format a range of the test text that includes the failure as
4476                 //  a data item that can be included in the rbbi test data file.
4477
4478                 // Start of the range is the last point where expected and actual results
4479                 //   both agreed that there was a break position.
4480                 int startContext = i;
4481                 int32_t count = 0;
4482                 for (;;) {
4483                     if (startContext==0) { break; }
4484                     startContext --;
4485                     if (expectedBreaks[startContext] != 0) {
4486                         if (count == 2) break;
4487                         count ++;
4488                     }
4489                 }
4490
4491                 // End of range is two expected breaks past the start position.
4492                 int endContext = i + 1;
4493                 int ci;
4494                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4495                     for (;;) {
4496                         if (endContext >= testText.length()) {break;}
4497                         if (expectedBreaks[endContext-1] != 0) {
4498                             if (count == 0) break;
4499                             count --;
4500                         }
4501                         endContext ++;
4502                     }
4503                 }
4504
4505                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4506                 UnicodeString errorText = "<data>";
4507                 /***if (strcmp(errorType, "next()") == 0) {
4508                     startContext = 0;
4509                     endContext = testText.length();
4510
4511                     printStringBreaks(testText, expected, expectedCount);
4512                 }***/
4513
4514                 for (ci=startContext; ci<endContext;) {
4515                     UnicodeString hexChars("0123456789abcdef");
4516                     UChar32  c;
4517                     int      bn;
4518                     c = testText.char32At(ci);
4519                     if (ci == i) {
4520                         // This is the location of the error.
4521                         errorText.append("<?>");
4522                     } else if (expectedBreaks[ci] != 0) {
4523                         // This a non-error expected break position.
4524                         errorText.append("\\");
4525                     }
4526                     if (c < 0x10000) {
4527                         errorText.append("\\u");
4528                         for (bn=12; bn>=0; bn-=4) {
4529                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4530                         }
4531                     } else {
4532                         errorText.append("\\U");
4533                         for (bn=28; bn>=0; bn-=4) {
4534                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4535                         }
4536                     }
4537                     ci = testText.moveIndex32(ci, 1);
4538                 }
4539                 errorText.append("\\");
4540                 errorText.append("</data>\n");
4541
4542                 // Output the error
4543                 char  charErrorTxt[500];
4544                 UErrorCode status = U_ZERO_ERROR;
4545                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4546                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4547                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4548
4549                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4550                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4551                     errorType, seed, i, charErrorTxt);
4552                 break;
4553             }
4554         }
4555
4556         loopCount++;
4557     }
4558 #endif
4559 }
4560
4561
4562 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4563 //             This test checks the initial patch,
4564 //             which is to just keep it from crashing.  Correct word boundaries
4565 //             await a proper fix to the dictionary code.
4566 //
4567 void RBBITest::TestBug5532(void)  {
4568    // Text includes a mixture of Thai and Latin.
4569    const unsigned char utf8Data[] = {
4570            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4571            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4572            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4573            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4574            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4575            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4576            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4577            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4578            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4579            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4580            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4581
4582     UErrorCode status = U_ZERO_ERROR;
4583     UText utext=UTEXT_INITIALIZER;
4584     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4585     TEST_ASSERT_SUCCESS(status);
4586
4587     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4588     TEST_ASSERT_SUCCESS(status);
4589     if (U_SUCCESS(status)) {
4590         bi->setText(&utext, status);
4591         TEST_ASSERT_SUCCESS(status);
4592
4593         int32_t breakCount = 0;
4594         int32_t previousBreak = -1;
4595         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4596             // For now, just make sure that the break iterator doesn't hang.
4597             TEST_ASSERT(previousBreak < bi->current());
4598             previousBreak = bi->current();
4599         }
4600         TEST_ASSERT(breakCount > 0);
4601     }
4602     delete bi;
4603     utext_close(&utext);
4604 }
4605
4606
4607 void RBBITest::TestBug9983(void)  {
4608     UnicodeString text = UnicodeString("\\u002A"  // * Other
4609                                        "\\uFF65"  //   Other
4610                                        "\\u309C"  //   Katakana
4611                                        "\\uFF9F"  //   Extend
4612                                        "\\uFF65"  //   Other
4613                                        "\\u0020"  //   Other
4614                                        "\\u0000").unescape();
4615
4616     UErrorCode status = U_ZERO_ERROR;
4617     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4618         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4619     TEST_ASSERT_SUCCESS(status);
4620     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4621         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4622     TEST_ASSERT_SUCCESS(status);
4623     if (U_FAILURE(status)) {
4624         return;
4625     }
4626     int32_t offset, rstatus, iterationCount;
4627
4628     brkiter->setText(text);
4629     brkiter->last();
4630     iterationCount = 0;
4631     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4632         iterationCount++;
4633         rstatus = brkiter->getRuleStatus();
4634         (void)rstatus;     // Suppress set but not used warning.
4635         if (iterationCount >= 10) {
4636            break;
4637         }
4638     }
4639     TEST_ASSERT(iterationCount == 6);
4640
4641     brkiterPOSIX->setText(text);
4642     brkiterPOSIX->last();
4643     iterationCount = 0;
4644     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4645         iterationCount++;
4646         rstatus = brkiterPOSIX->getRuleStatus();
4647         (void)rstatus;     // Suppress set but not used warning.
4648         if (iterationCount >= 10) {
4649            break;
4650         }
4651     }
4652     TEST_ASSERT(iterationCount == 6);
4653 }
4654
4655
4656 //
4657 //  TestDebug    -  A place-holder test for debugging purposes.
4658 //                  For putting in fragments of other tests that can be invoked
4659 //                  for tracing  without a lot of unwanted extra stuff happening.
4660 //
4661 void RBBITest::TestDebug(void) {
4662 #if 0
4663     UErrorCode   status = U_ZERO_ERROR;
4664     int pos = 0;
4665     int ruleStatus = 0;
4666
4667     RuleBasedBreakIterator* bi =
4668        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4669        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4670        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4671     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4672     // UnicodeString s("Aaa.  Bcd");
4673     s = s.unescape();
4674     bi->setText(s);
4675     UBool r = bi->isBoundary(8);
4676     printf("%s", r?"true":"false");
4677     return;
4678     pos = bi->last();
4679     do {
4680         // ruleStatus = bi->getRuleStatus();
4681         printf("%d\t%d\n", pos, ruleStatus);
4682         pos = bi->previous();
4683     } while (pos != BreakIterator::DONE);
4684 #endif
4685 }
4686
4687 void RBBITest::TestProperties() {
4688     UErrorCode errorCode = U_ZERO_ERROR;
4689     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4690     if (!prependSet.isEmpty()) {
4691         errln(
4692             "[:GCB=Prepend:] is not empty any more. "
4693             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4694             "change this test to the opposite condition.");
4695     }
4696 }
4697
4698 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */