icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2013, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "utypeinfo.h"  // for 'typeid' to work
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_BREAK_ITERATION
  17
  18 #include "unicode/utypes.h"
  19 #include "unicode/brkiter.h"
  20 #include "unicode/rbbi.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/utf16.h"
  23 #include "unicode/ucnv.h"
  24 #include "unicode/schriter.h"
  25 #include "unicode/uniset.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27 #include "unicode/regex.h"
  28 #endif
  29 #include "unicode/ustring.h"
  30 #include "unicode/utext.h"
  31 #include "intltest.h"
  32 #include "rbbitst.h"
  33 #include <string.h>
  34 #include "uvector.h"
  35 #include "uvectr32.h"
  36 #include <string.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include "unicode/numfmt.h"
  40 #include "unicode/uscript.h"
  41
  42 #define TEST_ASSERT(x) {if (!(x)) { \
  43     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  44
  45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  46     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  47
  48
  49 //---------------------------------------------
  50 // runIndexedTest
  51 //---------------------------------------------
  52
  53
  54 //  Note:  Before adding new tests to this file, check whether the desired test data can
  55 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
  56 //         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
  58 //         will run there as well, without additional effort.
  59
  60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  61 {
  62     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  63
  64     switch (index) {
  65 #if !UCONFIG_NO_FILE_IO
  66         case 0: name = "TestBug4153072";
  67             if(exec) TestBug4153072();                         break;
  68 #else
  69         case 0: name = "skip";
  70             break;
  71 #endif
  72
  73         case 1: name = "skip";
  74             break;
  75         case 2: name = "TestStatusReturn";
  76             if(exec) TestStatusReturn();                       break;
  77
  78 #if !UCONFIG_NO_FILE_IO
  79         case 3: name = "TestUnicodeFiles";
  80             if(exec) TestUnicodeFiles();                       break;
  81         case 4: name = "TestEmptyString";
  82             if(exec) TestEmptyString();                        break;
  83 #else
  84         case 3: case 4: name = "skip";
  85             break;
  86 #endif
  87
  88         case 5: name = "TestGetAvailableLocales";
  89             if(exec) TestGetAvailableLocales();                break;
  90
  91         case 6: name = "TestGetDisplayName";
  92             if(exec) TestGetDisplayName();                     break;
  93
  94 #if !UCONFIG_NO_FILE_IO
  95         case 7: name = "TestEndBehaviour";
  96             if(exec) TestEndBehaviour();                       break;
  97         case 8: case 9: case 10: name = "skip";
  98              break;
  99         case 11: name = "TestWordBreaks";
 100              if(exec) TestWordBreaks();                        break;
 101         case 12: name = "TestWordBoundary";
 102              if(exec) TestWordBoundary();                      break;
 103         case 13: name = "TestLineBreaks";
 104              if(exec) TestLineBreaks();                        break;
 105         case 14: name = "TestSentBreaks";
 106              if(exec) TestSentBreaks();                        break;
 107         case 15: name = "TestExtended";
 108              if(exec) TestExtended();                          break;
 109 #else
 110         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 111              break;
 112 #endif
 113
 114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 115         case 16:
 116             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
 117 #else
 118         case 16:
 119              name = "skip";                                    break;
 120 #endif
 121
 122 #if !UCONFIG_NO_FILE_IO
 123         case 17: name = "TestBug3818";
 124             if(exec) TestBug3818();                            break;
 125 #else
 126         case 17: name = "skip";
 127             break;
 128 #endif
 129
 130         case 18: name = "skip";
 131             break;
 132         case 19: name = "TestDebug";
 133             if(exec) TestDebug();                              break;
 134         case 20: name = "skip";
 135             break;
 136
 137 #if !UCONFIG_NO_FILE_IO
 138         case 21: name = "TestBug5775";
 139             if (exec) TestBug5775();                           break;
 140 #else
 141         case 21: name = "skip";
 142             break;
 143 #endif
 144
 145         case 22: name = "TestBug9983";
 146             if (exec) TestBug9983();                           break;
 147         case 23: name = "TestDictRules";
 148             if (exec) TestDictRules();                         break;
 149         case 24: name = "TestBug5532";
 150             if (exec) TestBug5532();                           break;
 151         default: name = ""; break; //needed to end loop
 152     }
 153 }
 154
 155
 156 //---------------------------------------------------------------------------
 157 //
 158 //   class BITestData   Holds a set of Break iterator test data and results
 159 //                      Includes
 160 //                         - the string data to be broken
 161 //                         - a vector of the expected break positions.
 162 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
 164 //                         - The expected break tag values.
 165 //                         - Vectors of actual break positions and tag values.
 166 //                         - Functions for comparing actual with expected and
 167 //                            reporting errors.
 168 //
 169 //----------------------------------------------------------------------------
 170 class BITestData {
 171 public:
 172     UnicodeString    fDataToBreak;
 173     UVector          fExpectedBreakPositions;
 174     UVector          fExpectedTags;
 175     UVector          fLineNum;
 176     UVector          fActualBreakPositions;   // Test Results.
 177     UVector          fActualTags;
 178
 179     BITestData(UErrorCode &status);
 180     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 181     void             checkResults(const char *heading, RBBITest *test);
 182     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 183     void             clearResults();
 184 };
 185
 186 //
 187 // Constructor.
 188 //
 189 BITestData::BITestData(UErrorCode &status)
 190 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 191   fActualTags(status)
 192 {
 193 }
 194
 195 //
 196 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 197 //                 The macro form collects the line number, which is helpful
 198 //                 when tracking down failures.
 199 //
 200 //                 A null data item is inserted at the start of each test's data
 201 //                  to put the starting zero into the data list.  The position saved for
 202 //                  each non-null item is its ending position.
 203 //
 204 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 206     if (U_FAILURE(status)) {return;}
 207     if (data != NULL) {
 208         fDataToBreak.append(CharsToUnicodeString(data));
 209     }
 210     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 211     fExpectedTags.addElement(tag, status);
 212     fLineNum.addElement(lineNum, status);
 213 }
 214
 215
 216 //
 217 //  checkResults.   Compare the actual and expected break positions, report any differences.
 218 //
 219 void BITestData::checkResults(const char *heading, RBBITest *test) {
 220     int32_t   expectedIndex = 0;
 221     int32_t   actualIndex = 0;
 222
 223     for (;;) {
 224         // If we've run through both the expected and actual results vectors, we're done.
 225         //   break out of the loop.
 226         if (expectedIndex >= fExpectedBreakPositions.size() &&
 227             actualIndex   >= fActualBreakPositions.size()) {
 228             break;
 229         }
 230
 231
 232         if (expectedIndex >= fExpectedBreakPositions.size()) {
 233             err(heading, test, expectedIndex-1, actualIndex);
 234             actualIndex++;
 235             continue;
 236         }
 237
 238         if (actualIndex >= fActualBreakPositions.size()) {
 239             err(heading, test, expectedIndex, actualIndex-1);
 240             expectedIndex++;
 241             continue;
 242         }
 243
 244         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 245             err(heading, test, expectedIndex, actualIndex);
 246             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 247             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 248                 actualIndex++;
 249             } else {
 250                 expectedIndex++;
 251             }
 252             continue;
 253         }
 254
 255         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 256             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 257                 heading, fLineNum.elementAt(expectedIndex),
 258                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 259         }
 260
 261         actualIndex++;
 262         expectedIndex++;
 263     }
 264 }
 265
 266 //
 267 //  err   -  An error was found.  Report it, along with information about where the
 268 //                                incorrectly broken test data appeared in the source file.
 269 //
 270 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 271 {
 272     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 273     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 274     int32_t   o        = 0;
 275     int32_t   line     = fLineNum.elementAti(expectedIdx);
 276     if (expectedIdx > 0) {
 277         // The line numbers are off by one because a premature break occurs somewhere
 278         //    within the previous item, rather than at the start of the current (expected) item.
 279         //    We want to report the offset of the unexpected break from the start of
 280         //      this previous item.
 281         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 282     }
 283     if (actual < expected) {
 284         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 285     } else {
 286         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 287     }
 288 }
 289
 290
 291 void BITestData::clearResults() {
 292     fActualBreakPositions.removeAllElements();
 293     fActualTags.removeAllElements();
 294 }
 295
 296
 297 //--------------------------------------------------------------------------------------
 298 //
 299 //    RBBITest    constructor and destructor
 300 //
 301 //--------------------------------------------------------------------------------------
 302
 303 RBBITest::RBBITest() {
 304 }
 305
 306
 307 RBBITest::~RBBITest() {
 308 }
 309
 310 //-----------------------------------------------------------------------------------
 311 //
 312 //   Test for status {tag} return value from break rules.
 313 //        TODO:  a more thorough test.
 314 //
 315 //-----------------------------------------------------------------------------------
 316 void RBBITest::TestStatusReturn() {
 317      UnicodeString rulesString1("$Letters = [:L:];\n"
 318                                   "$Numbers = [:N:];\n"
 319                                   "$Letters+{1};\n"
 320                                   "$Numbers+{2};\n"
 321                                   "Help\\ {4}/me\\!;\n"
 322                                   "[^$Letters $Numbers];\n"
 323                                   "!.*;\n", -1, US_INV);
 324      UnicodeString testString1  = "abc123..abc Help me Help me!";
 325                                 // 01234567890123456789012345678
 326      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 327      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 328
 329      UErrorCode status=U_ZERO_ERROR;
 330      UParseError    parseError;
 331
 332      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 333      if(U_FAILURE(status)) {
 334          dataerrln("FAIL : in construction - %s", u_errorName(status));
 335      } else {
 336          int32_t  pos;
 337          int32_t  i = 0;
 338          bi->setText(testString1);
 339          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 340              if (pos != bounds1[i]) {
 341                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 342                  break;
 343              }
 344
 345              int tag = bi->getRuleStatus();
 346              if (tag != brkStatus[i]) {
 347                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 348                  break;
 349              }
 350              i++;
 351          }
 352      }
 353      delete bi;
 354 }
 355
 356
 357 static void printStringBreaks(UnicodeString ustr, int expected[],
 358                               int expectedcount)
 359 {
 360     UErrorCode status = U_ZERO_ERROR;
 361     char name[100];
 362     printf("code    alpha extend alphanum type word sent line name\n");
 363     int j;
 364     for (j = 0; j < ustr.length(); j ++) {
 365         if (expectedcount > 0) {
 366             int k;
 367             for (k = 0; k < expectedcount; k ++) {
 368                 if (j == expected[k]) {
 369                     printf("------------------------------------------------ %d\n",
 370                            j);
 371                 }
 372             }
 373         }
 374         UChar32 c = ustr.char32At(j);
 375         if (c > 0xffff) {
 376             j ++;
 377         }
 378         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 379         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 380                            u_isUAlphabetic(c),
 381                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 382                            u_isalnum(c),
 383                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 384                                                   u_charType(c),
 385                                                   U_SHORT_PROPERTY_NAME),
 386                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 387                                                   u_getIntPropertyValue(c,
 388                                                           UCHAR_WORD_BREAK),
 389                                                   U_SHORT_PROPERTY_NAME),
 390                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 391                                    u_getIntPropertyValue(c,
 392                                            UCHAR_SENTENCE_BREAK),
 393                                    U_SHORT_PROPERTY_NAME),
 394                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 395                                    u_getIntPropertyValue(c,
 396                                            UCHAR_LINE_BREAK),
 397                                    U_SHORT_PROPERTY_NAME),
 398                            name);
 399     }
 400 }
 401
 402
 403 void RBBITest::TestBug3818() {
 404     UErrorCode  status = U_ZERO_ERROR;
 405
 406     // Four Thai words...
 407     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 408                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 409     UnicodeString  thaiStr(thaiWordData);
 410
 411     RuleBasedBreakIterator* bi =
 412         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 413     if (U_FAILURE(status) || bi == NULL) {
 414         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 415         return;
 416     }
 417     bi->setText(thaiStr);
 418
 419     int32_t  startOfSecondWord = bi->following(1);
 420     if (startOfSecondWord != 4) {
 421         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 422             __FILE__, __LINE__, startOfSecondWord);
 423     }
 424     startOfSecondWord = bi->following(0);
 425     if (startOfSecondWord != 4) {
 426         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 427             __FILE__, __LINE__, startOfSecondWord);
 428     }
 429     delete bi;
 430 }
 431
 432 //----------------------------------------------------------------------------
 433 //
 434 // generalIteratorTest      Given a break iterator and a set of test data,
 435 //                          Run the tests and report the results.
 436 //
 437 //----------------------------------------------------------------------------
 438 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 439 {
 440
 441     bi.setText(td.fDataToBreak);
 442
 443     testFirstAndNext(bi, td);
 444
 445     testLastAndPrevious(bi, td);
 446
 447     testFollowing(bi, td);
 448     testPreceding(bi, td);
 449     testIsBoundary(bi, td);
 450     doMultipleSelectionTest(bi, td);
 451 }
 452
 453
 454 //
 455 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 456 //                       kind of loop.
 457 //
 458 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 459 {
 460     UErrorCode  status = U_ZERO_ERROR;
 461     int32_t     p;
 462     int32_t     lastP = -1;
 463     int32_t     tag;
 464
 465     logln("Test first and next");
 466     bi.setText(td.fDataToBreak);
 467     td.clearResults();
 468
 469     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 470         td.fActualBreakPositions.addElement(p, status);  // Save result.
 471         tag = bi.getRuleStatus();
 472         td.fActualTags.addElement(tag, status);
 473         if (p <= lastP) {
 474             // If the iterator is not making forward progress, stop.
 475             //  No need to raise an error here, it'll be detected in the normal check of results.
 476             break;
 477         }
 478         lastP = p;
 479     }
 480     td.checkResults("testFirstAndNext", this);
 481 }
 482
 483
 484 //
 485 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 486 //
 487 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 488 {
 489     UErrorCode  status = U_ZERO_ERROR;
 490     int32_t     p;
 491     int32_t     lastP  = 0x7ffffffe;
 492     int32_t     tag;
 493
 494     logln("Test last and previous");
 495     bi.setText(td.fDataToBreak);
 496     td.clearResults();
 497
 498     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 499         // Save break position.  Insert it at start of vector of results, shoving
 500         //    already-saved results further towards the end.
 501         td.fActualBreakPositions.insertElementAt(p, 0, status);
 502         // bi.previous();   // TODO:  Why does this fix things up????
 503         // bi.next();
 504         tag = bi.getRuleStatus();
 505         td.fActualTags.insertElementAt(tag, 0, status);
 506         if (p >= lastP) {
 507             // If the iterator is not making progress, stop.
 508             //  No need to raise an error here, it'll be detected in the normal check of results.
 509             break;
 510         }
 511         lastP = p;
 512     }
 513     td.checkResults("testLastAndPrevious", this);
 514 }
 515
 516
 517 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 518 {
 519     UErrorCode  status = U_ZERO_ERROR;
 520     int32_t     p;
 521     int32_t     tag;
 522     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 523                                  //   cannot be -1; that is returned for DONE.
 524     int         i;
 525
 526     logln("testFollowing():");
 527     bi.setText(td.fDataToBreak);
 528     td.clearResults();
 529
 530     // Save the starting point, since we won't get that out of following.
 531     p = bi.first();
 532     td.fActualBreakPositions.addElement(p, status);  // Save result.
 533     tag = bi.getRuleStatus();
 534     td.fActualTags.addElement(tag, status);
 535
 536     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 537         p = bi.following(i);
 538         if (p != lastP) {
 539             if (p == RuleBasedBreakIterator::DONE) {
 540                 break;
 541             }
 542             // We've reached a new break position.  Save it.
 543             td.fActualBreakPositions.addElement(p, status);  // Save result.
 544             tag = bi.getRuleStatus();
 545             td.fActualTags.addElement(tag, status);
 546             lastP = p;
 547         }
 548     }
 549     // The loop normally exits by means of the break in the middle.
 550     // Make sure that the index was at the correct position for the break iterator to have
 551     //   returned DONE.
 552     if (i != td.fDataToBreak.length()) {
 553         errln("testFollowing():  iterator returned DONE prematurely.");
 554     }
 555
 556     // Full check of all results.
 557     td.checkResults("testFollowing", this);
 558 }
 559
 560
 561
 562 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 563     UErrorCode  status = U_ZERO_ERROR;
 564     int32_t     p;
 565     int32_t     tag;
 566     int32_t     lastP  = 0x7ffffffe;
 567     int         i;
 568
 569     logln("testPreceding():");
 570     bi.setText(td.fDataToBreak);
 571     td.clearResults();
 572
 573     p = bi.last();
 574     td.fActualBreakPositions.addElement(p, status);
 575     tag = bi.getRuleStatus();
 576     td.fActualTags.addElement(tag, status);
 577
 578     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 579         p = bi.preceding(i);
 580         if (p != lastP) {
 581             if (p == RuleBasedBreakIterator::DONE) {
 582                 break;
 583             }
 584             // We've reached a new break position.  Save it.
 585             td.fActualBreakPositions.insertElementAt(p, 0, status);
 586             lastP = p;
 587             tag = bi.getRuleStatus();
 588             td.fActualTags.insertElementAt(tag, 0, status);
 589         }
 590     }
 591     // The loop normally exits by means of the break in the middle.
 592     // Make sure that the index was at the correct position for the break iterator to have
 593     //   returned DONE.
 594     if (i != 0) {
 595         errln("testPreceding():  iterator returned DONE prematurely.");
 596     }
 597
 598     // Full check of all results.
 599     td.checkResults("testPreceding", this);
 600 }
 601
 602
 603
 604 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 605     UErrorCode  status = U_ZERO_ERROR;
 606     int         i;
 607     int32_t     tag;
 608
 609     logln("testIsBoundary():");
 610     bi.setText(td.fDataToBreak);
 611     td.clearResults();
 612
 613     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 614         if (bi.isBoundary(i)) {
 615             td.fActualBreakPositions.addElement(i, status);  // Save result.
 616             tag = bi.getRuleStatus();
 617             td.fActualTags.addElement(tag, status);
 618         }
 619     }
 620     td.checkResults("testIsBoundary: ", this);
 621 }
 622
 623
 624
 625 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 626 {
 627     iterator.setText(td.fDataToBreak);
 628
 629     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 630     int32_t offset = iterator.first();
 631     int32_t testOffset;
 632     int32_t count = 0;
 633
 634     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 635
 636     if (*testIterator != iterator)
 637         errln("clone() or operator!= failed: two clones compared unequal");
 638
 639     do {
 640         testOffset = testIterator->first();
 641         testOffset = testIterator->next(count);
 642         if (offset != testOffset)
 643             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 644
 645         if (offset != RuleBasedBreakIterator::DONE) {
 646             count++;
 647             offset = iterator.next();
 648
 649             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 650                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 651                 if (count > 10000 || offset == -1) {
 652                     errln("operator== failed too many times. Stopping test.");
 653                     if (offset == -1) {
 654                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 655                     }
 656                     return;
 657                 }
 658             }
 659         }
 660     } while (offset != RuleBasedBreakIterator::DONE);
 661
 662     // now do it backwards...
 663     offset = iterator.last();
 664     count = 0;
 665
 666     do {
 667         testOffset = testIterator->last();
 668         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 669         if (offset != testOffset)
 670             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 671
 672         if (offset != RuleBasedBreakIterator::DONE) {
 673             count--;
 674             offset = iterator.previous();
 675         }
 676     } while (offset != RuleBasedBreakIterator::DONE);
 677
 678     delete testIterator;
 679 }
 680
 681
 682 //---------------------------------------------
 683 //
 684 //     other tests
 685 //
 686 //---------------------------------------------
 687 void RBBITest::TestEmptyString()
 688 {
 689     UnicodeString text = "";
 690     UErrorCode status = U_ZERO_ERROR;
 691
 692     BITestData x(status);
 693     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 694     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 695     if (U_FAILURE(status))
 696     {
 697         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 698         return;
 699     }
 700     generalIteratorTest(*bi, x);
 701     delete bi;
 702 }
 703
 704 void RBBITest::TestGetAvailableLocales()
 705 {
 706     int32_t locCount = 0;
 707     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 708
 709     if (locCount == 0)
 710         dataerrln("getAvailableLocales() returned an empty list!");
 711     // Just make sure that it's returning good memory.
 712     int32_t i;
 713     for (i = 0; i < locCount; ++i) {
 714         logln(locList[i].getName());
 715     }
 716 }
 717
 718 //Testing the BreakIterator::getDisplayName() function
 719 void RBBITest::TestGetDisplayName()
 720 {
 721     UnicodeString   result;
 722
 723     BreakIterator::getDisplayName(Locale::getUS(), result);
 724     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 725         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 726                 + result);
 727
 728     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 729     if (result != "French (France)")
 730         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 731                 + result);
 732 }
 733 /**
 734  * Test End Behaviour
 735  * @bug 4068137
 736  */
 737 void RBBITest::TestEndBehaviour()
 738 {
 739     UErrorCode status = U_ZERO_ERROR;
 740     UnicodeString testString("boo.");
 741     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 742     if (U_FAILURE(status))
 743     {
 744         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 745         return;
 746     }
 747     wb->setText(testString);
 748
 749     if (wb->first() != 0)
 750         errln("Didn't get break at beginning of string.");
 751     if (wb->next() != 3)
 752         errln("Didn't get break before period in \"boo.\"");
 753     if (wb->current() != 4 && wb->next() != 4)
 754         errln("Didn't get break at end of string.");
 755     delete wb;
 756 }
 757 /*
 758  * @bug 4153072
 759  */
 760 void RBBITest::TestBug4153072() {
 761     UErrorCode status = U_ZERO_ERROR;
 762     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 763     if (U_FAILURE(status))
 764     {
 765         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 766         return;
 767     }
 768     UnicodeString str("...Hello, World!...");
 769     int32_t begin = 3;
 770     int32_t end = str.length() - 3;
 771     UBool onBoundary;
 772
 773     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 774     iter->adoptText(textIterator);
 775     int index;
 776     // Note: with the switch to UText, there is no way to restrict the
 777     //       iteration range to begin at an index other than zero.
 778     //       String character iterators created with a non-zero bound are
 779     //         treated by RBBI as being empty.
 780     for (index = -1; index < begin + 1; ++index) {
 781         onBoundary = iter->isBoundary(index);
 782         if (index == 0?  !onBoundary : onBoundary) {
 783             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 784                             " and begin index = " + begin);
 785         }
 786     }
 787     delete iter;
 788 }
 789
 790
 791 //
 792 // Test for problem reported by Ashok Matoria on 9 July 2007
 793 //    One.<kSoftHyphen><kSpace>Two.
 794 //
 795 //    Sentence break at start (0) and then on calling next() it breaks at
 796 //   'T' of "Two". Now, at this point if I do next() and
 797 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 798 //
 799 void RBBITest::TestBug5775() {
 800     UErrorCode status = U_ZERO_ERROR;
 801     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 802     TEST_ASSERT_SUCCESS(status);
 803     if (U_FAILURE(status)) {
 804         return;
 805     }
 806 // Check for status first for better handling of no data errors.
 807     TEST_ASSERT(bi != NULL);
 808     if (bi == NULL) {
 809         return;
 810     }
 811
 812     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 813     //               01234      56789
 814     s = s.unescape();
 815     bi->setText(s);
 816     int pos = bi->next();
 817     TEST_ASSERT(pos == 6);
 818     pos = bi->next();
 819     TEST_ASSERT(pos == 10);
 820     pos = bi->previous();
 821     TEST_ASSERT(pos == 6);
 822     delete bi;
 823 }
 824
 825
 826
 827 //------------------------------------------------------------------------------
 828 //
 829 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 830 //
 831 //------------------------------------------------------------------------------
 832
 833 struct TestParams {
 834     BreakIterator   *bi;
 835     UnicodeString    dataToBreak;
 836     UVector32       *expectedBreaks;
 837     UVector32       *srcLine;
 838     UVector32       *srcCol;
 839 };
 840
 841 void RBBITest::executeTest(TestParams *t) {
 842     int32_t    bp;
 843     int32_t    prevBP;
 844     int32_t    i;
 845
 846     if (t->bi == NULL) {
 847         return;
 848     }
 849
 850     t->bi->setText(t->dataToBreak);
 851     //
 852     //  Run the iterator forward
 853     //
 854     prevBP = -1;
 855     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 856         if (prevBP ==  bp) {
 857             // Fail for lack of forward progress.
 858             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 859                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 860             break;
 861         }
 862
 863         // Check that there were we didn't miss an expected break between the last one
 864         //  and this one.
 865         for (i=prevBP+1; i<bp; i++) {
 866             if (t->expectedBreaks->elementAti(i) != 0) {
 867                 int expected[] = {0, i};
 868                 printStringBreaks(t->dataToBreak, expected, 2);
 869                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 870                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 871             }
 872         }
 873
 874         // Check that the break we did find was expected
 875         if (t->expectedBreaks->elementAti(bp) == 0) {
 876             int expected[] = {0, bp};
 877             printStringBreaks(t->dataToBreak, expected, 2);
 878             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 879                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 880         } else {
 881             // The break was expected.
 882             //   Check that the {nnn} tag value is correct.
 883             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
 884             if (expectedTagVal == -1) {
 885                 expectedTagVal = 0;
 886             }
 887             int32_t line = t->srcLine->elementAti(bp);
 888             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
 889             if (rs != expectedTagVal) {
 890                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
 891                       "          Actual, Expected status = %4d, %4d",
 892                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
 893             }
 894         }
 895
 896
 897         prevBP = bp;
 898     }
 899
 900     // Verify that there were no missed expected breaks after the last one found
 901     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
 902         if (t->expectedBreaks->elementAti(i) != 0) {
 903             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 904                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 905         }
 906     }
 907
 908     //
 909     //  Run the iterator backwards, verify that the same breaks are found.
 910     //
 911     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
 912     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
 913         if (prevBP ==  bp) {
 914             // Fail for lack of progress.
 915             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
 916                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 917             break;
 918         }
 919
 920         // Check that there were we didn't miss an expected break between the last one
 921         //  and this one.  (UVector returns zeros for index out of bounds.)
 922         for (i=prevBP-1; i>bp; i--) {
 923             if (t->expectedBreaks->elementAti(i) != 0) {
 924                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 925                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 926             }
 927         }
 928
 929         // Check that the break we did find was expected
 930         if (t->expectedBreaks->elementAti(bp) == 0) {
 931             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 932                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 933         } else {
 934             // The break was expected.
 935             //   Check that the {nnn} tag value is correct.
 936             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
 937             if (expectedTagVal == -1) {
 938                 expectedTagVal = 0;
 939             }
 940             int line = t->srcLine->elementAti(bp);
 941             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
 942             if (rs != expectedTagVal) {
 943                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
 944                       "          Actual, Expected status = %4d, %4d",
 945                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
 946             }
 947         }
 948
 949         prevBP = bp;
 950     }
 951
 952     // Verify that there were no missed breaks prior to the last one found
 953     for (i=prevBP-1; i>=0; i--) {
 954         if (t->expectedBreaks->elementAti(i) != 0) {
 955             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 956                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 957         }
 958     }
 959
 960     // Check isBoundary()
 961     for (i=0; i<t->expectedBreaks->size(); i++) {
 962         UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
 963         UBool boundaryFound    = t->bi->isBoundary(i);
 964         if (boundaryExpected != boundaryFound) {
 965             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
 966                   "        Expected, Actual= %s, %s",
 967                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
 968                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
 969         }
 970     }
 971
 972     // Check following()
 973     for (i=0; i<t->expectedBreaks->size(); i++) {
 974         int32_t actualBreak = t->bi->following(i);
 975         int32_t expectedBreak = BreakIterator::DONE;
 976         for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
 977             if (t->expectedBreaks->elementAti(j) != 0) {
 978                 expectedBreak = j;
 979                 break;
 980             }
 981         }
 982         if (expectedBreak != actualBreak) {
 983             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
 984                   "        Expected, Actual= %d, %d",
 985                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
 986         }
 987     }
 988
 989     // Check preceding()
 990     for (i=t->expectedBreaks->size(); i>=0; i--) {
 991         int32_t actualBreak = t->bi->preceding(i);
 992         int32_t expectedBreak = BreakIterator::DONE;
 993
 994         for (int32_t j=i-1; j >= 0; j--) {
 995             if (t->expectedBreaks->elementAti(j) != 0) {
 996                 expectedBreak = j;
 997                 break;
 998             }
 999         }
1000         if (expectedBreak != actualBreak) {
1001             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1002                   "        Expected, Actual= %d, %d",
1003                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1004         }
1005     }
1006 }
1007
1008
1009 void RBBITest::TestExtended() {
1010 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1011     UErrorCode      status  = U_ZERO_ERROR;
1012     Locale          locale("");
1013
1014     UnicodeString       rules;
1015     TestParams          tp;
1016     tp.bi             = NULL;
1017     tp.expectedBreaks = new UVector32(status);
1018     tp.srcLine        = new UVector32(status);
1019     tp.srcCol         = new UVector32(status);
1020
1021     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1022     if (U_FAILURE(status)) {
1023         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1024     }
1025
1026
1027     //
1028     //  Open and read the test data file.
1029     //
1030     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1031     char testFileName[1000];
1032     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1033         errln("Can't open test data.  Path too long.");
1034         return;
1035     }
1036     strcpy(testFileName, testDataDirectory);
1037     strcat(testFileName, "rbbitst.txt");
1038
1039     int    len;
1040     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1041     if (U_FAILURE(status)) {
1042         return; /* something went wrong, error already output */
1043     }
1044
1045
1046
1047
1048     //
1049     //  Put the test data into a UnicodeString
1050     //
1051     UnicodeString testString(FALSE, testFile, len);
1052
1053     enum EParseState{
1054         PARSE_COMMENT,
1055         PARSE_TAG,
1056         PARSE_DATA,
1057         PARSE_NUM
1058     }
1059     parseState = PARSE_TAG;
1060
1061     EParseState savedState = PARSE_TAG;
1062
1063     static const UChar CH_LF        = 0x0a;
1064     static const UChar CH_CR        = 0x0d;
1065     static const UChar CH_HASH      = 0x23;
1066     /*static const UChar CH_PERIOD    = 0x2e;*/
1067     static const UChar CH_LT        = 0x3c;
1068     static const UChar CH_GT        = 0x3e;
1069     static const UChar CH_BACKSLASH = 0x5c;
1070     static const UChar CH_BULLET    = 0x2022;
1071
1072     int32_t    lineNum  = 1;
1073     int32_t    colStart = 0;
1074     int32_t    column   = 0;
1075     int32_t    charIdx  = 0;
1076
1077     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1078
1079     for (charIdx = 0; charIdx < len; ) {
1080         status = U_ZERO_ERROR;
1081         UChar  c = testString.charAt(charIdx);
1082         charIdx++;
1083         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1084             // treat CRLF as a unit
1085             c = CH_LF;
1086             charIdx++;
1087         }
1088         if (c == CH_LF || c == CH_CR) {
1089             lineNum++;
1090             colStart = charIdx;
1091         }
1092         column = charIdx - colStart + 1;
1093
1094         switch (parseState) {
1095         case PARSE_COMMENT:
1096             if (c == 0x0a || c == 0x0d) {
1097                 parseState = savedState;
1098             }
1099             break;
1100
1101         case PARSE_TAG:
1102             {
1103             if (c == CH_HASH) {
1104                 parseState = PARSE_COMMENT;
1105                 savedState = PARSE_TAG;
1106                 break;
1107             }
1108             if (u_isUWhiteSpace(c)) {
1109                 break;
1110             }
1111             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1112                 delete tp.bi;
1113                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1114                 charIdx += 5;
1115                 break;
1116             }
1117             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1118                 delete tp.bi;
1119                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1120                 charIdx += 5;
1121                 break;
1122             }
1123             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1124                 delete tp.bi;
1125                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1126                 charIdx += 5;
1127                 break;
1128             }
1129             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1130                 delete tp.bi;
1131                 tp.bi = NULL;
1132                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1133                 charIdx += 5;
1134                 break;
1135             }
1136             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1137                 delete tp.bi;
1138                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1139                 charIdx += 6;
1140                 break;
1141             }
1142
1143             // <locale  loc_name>
1144             localeMatcher.reset(testString);
1145             if (localeMatcher.lookingAt(charIdx-1, status)) {
1146                 UnicodeString localeName = localeMatcher.group(1, status);
1147                 char localeName8[100];
1148                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1149                 locale = Locale::createFromName(localeName8);
1150                 charIdx += localeMatcher.group(0, status).length() - 1;
1151                 TEST_ASSERT_SUCCESS(status);
1152                 break;
1153             }
1154             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1155                 parseState = PARSE_DATA;
1156                 charIdx += 5;
1157                 tp.dataToBreak = "";
1158                 tp.expectedBreaks->removeAllElements();
1159                 tp.srcCol ->removeAllElements();
1160                 tp.srcLine->removeAllElements();
1161                 break;
1162             }
1163
1164             errln("line %d: Tag expected in test file.", lineNum);
1165             parseState = PARSE_COMMENT;
1166             savedState = PARSE_DATA;
1167             goto end_test; // Stop the test.
1168             }
1169             break;
1170
1171         case PARSE_DATA:
1172             if (c == CH_BULLET) {
1173                 int32_t  breakIdx = tp.dataToBreak.length();
1174                 tp.expectedBreaks->setSize(breakIdx+1);
1175                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1176                 tp.srcLine->setSize(breakIdx+1);
1177                 tp.srcLine->setElementAt(lineNum, breakIdx);
1178                 tp.srcCol ->setSize(breakIdx+1);
1179                 tp.srcCol ->setElementAt(column, breakIdx);
1180                 break;
1181             }
1182
1183             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1184                 // Add final entry to mappings from break location to source file position.
1185                 //  Need one extra because last break position returned is after the
1186                 //    last char in the data, not at the last char.
1187                 tp.srcLine->addElement(lineNum, status);
1188                 tp.srcCol ->addElement(column, status);
1189
1190                 parseState = PARSE_TAG;
1191                 charIdx += 6;
1192
1193                 // RUN THE TEST!
1194                 executeTest(&tp);
1195                 break;
1196             }
1197
1198             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1199                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1200                 // Get the code point from the name and insert it into the test data.
1201                 //   (Damn, no API takes names in Unicode  !!!
1202                 //    we've got to take it back to char *)
1203                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1204                 int32_t nameLength = nameEndIdx - (charIdx+2);
1205                 char charNameBuf[200];
1206                 UChar32 theChar = -1;
1207                 if (nameEndIdx != -1) {
1208                     UErrorCode status = U_ZERO_ERROR;
1209                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1210                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1211                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1212                     if (U_FAILURE(status)) {
1213                         theChar = -1;
1214                     }
1215                 }
1216                 if (theChar == -1) {
1217                     errln("Error in named character in test file at line %d, col %d",
1218                         lineNum, column);
1219                 } else {
1220                     // Named code point was recognized.  Insert it
1221                     //   into the test data.
1222                     tp.dataToBreak.append(theChar);
1223                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1224                         tp.srcLine->addElement(lineNum, status);
1225                         tp.srcCol ->addElement(column, status);
1226                     }
1227                 }
1228                 if (nameEndIdx > charIdx) {
1229                     charIdx = nameEndIdx+1;
1230
1231                 }
1232                 break;
1233             }
1234
1235
1236
1237
1238             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1239                 charIdx++;
1240                 int32_t  breakIdx = tp.dataToBreak.length();
1241                 tp.expectedBreaks->setSize(breakIdx+1);
1242                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1243                 tp.srcLine->setSize(breakIdx+1);
1244                 tp.srcLine->setElementAt(lineNum, breakIdx);
1245                 tp.srcCol ->setSize(breakIdx+1);
1246                 tp.srcCol ->setElementAt(column, breakIdx);
1247                 break;
1248             }
1249
1250             if (c == CH_LT) {
1251                 tagValue   = 0;
1252                 parseState = PARSE_NUM;
1253                 break;
1254             }
1255
1256             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1257                 parseState = PARSE_COMMENT;
1258                 savedState = PARSE_DATA;
1259                 break;
1260             }
1261
1262             if (c == CH_BACKSLASH) {
1263                 // Check for \ at end of line, a line continuation.
1264                 //     Advance over (discard) the newline
1265                 UChar32 cp = testString.char32At(charIdx);
1266                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1267                     // We have a CR LF
1268                     //  Need an extra increment of the input ptr to move over both of them
1269                     charIdx++;
1270                 }
1271                 if (cp == CH_LF || cp == CH_CR) {
1272                     lineNum++;
1273                     colStart = charIdx;
1274                     charIdx++;
1275                     break;
1276                 }
1277
1278                 // Let unescape handle the back slash.
1279                 cp = testString.unescapeAt(charIdx);
1280                 if (cp != -1) {
1281                     // Escape sequence was recognized.  Insert the char
1282                     //   into the test data.
1283                     tp.dataToBreak.append(cp);
1284                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1285                         tp.srcLine->addElement(lineNum, status);
1286                         tp.srcCol ->addElement(column, status);
1287                     }
1288                     break;
1289                 }
1290
1291
1292                 // Not a recognized backslash escape sequence.
1293                 // Take the next char as a literal.
1294                 //  TODO:  Should this be an error?
1295                 c = testString.charAt(charIdx);
1296                 charIdx = testString.moveIndex32(charIdx, 1);
1297             }
1298
1299             // Normal, non-escaped data char.
1300             tp.dataToBreak.append(c);
1301
1302             // Save the mapping from offset in the data to line/column numbers in
1303             //   the original input file.  Will be used for better error messages only.
1304             //   If there's an expected break before this char, the slot in the mapping
1305             //     vector will already be set for this char; don't overwrite it.
1306             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1307                 tp.srcLine->addElement(lineNum, status);
1308                 tp.srcCol ->addElement(column, status);
1309             }
1310             break;
1311
1312
1313         case PARSE_NUM:
1314             // We are parsing an expected numeric tag value, like <1234>,
1315             //   within a chunk of data.
1316             if (u_isUWhiteSpace(c)) {
1317                 break;
1318             }
1319
1320             if (c == CH_GT) {
1321                 // Finished the number.  Add the info to the expected break data,
1322                 //   and switch parse state back to doing plain data.
1323                 parseState = PARSE_DATA;
1324                 if (tagValue == 0) {
1325                     tagValue = -1;
1326                 }
1327                 int32_t  breakIdx = tp.dataToBreak.length();
1328                 tp.expectedBreaks->setSize(breakIdx+1);
1329                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1330                 tp.srcLine->setSize(breakIdx+1);
1331                 tp.srcLine->setElementAt(lineNum, breakIdx);
1332                 tp.srcCol ->setSize(breakIdx+1);
1333                 tp.srcCol ->setElementAt(column, breakIdx);
1334                 break;
1335             }
1336
1337             if (u_isdigit(c)) {
1338                 tagValue = tagValue*10 + u_charDigitValue(c);
1339                 break;
1340             }
1341
1342             errln("Syntax Error in test file at line %d, col %d",
1343                 lineNum, column);
1344             parseState = PARSE_COMMENT;
1345             goto end_test; // Stop the test
1346             break;
1347         }
1348
1349
1350         if (U_FAILURE(status)) {
1351             dataerrln("ICU Error %s while parsing test file at line %d.",
1352                 u_errorName(status), lineNum);
1353             status = U_ZERO_ERROR;
1354             goto end_test; // Stop the test
1355         }
1356
1357     }
1358
1359 end_test:
1360     delete tp.bi;
1361     delete tp.expectedBreaks;
1362     delete tp.srcLine;
1363     delete tp.srcCol;
1364     delete [] testFile;
1365 #endif
1366 }
1367
1368
1369 //-------------------------------------------------------------------------------
1370 //
1371 //  TestDictRules   create a break iterator from source rules that includes a
1372 //                  dictionary range.   Regression for bug #7130.  Source rules
1373 //                  do not declare a break iterator type (word, line, sentence, etc.
1374 //                  but the dictionary code, without a type, would loop.
1375 //
1376 //-------------------------------------------------------------------------------
1377 void RBBITest::TestDictRules() {
1378     const char *rules =  "$dictionary = [a-z]; \n"
1379                          "!!forward; \n"
1380                          "$dictionary $dictionary; \n"
1381                          "!!reverse; \n"
1382                          "$dictionary $dictionary; \n";
1383     const char *text = "aa";
1384     UErrorCode status = U_ZERO_ERROR;
1385     UParseError parseError;
1386
1387     RuleBasedBreakIterator bi(rules, parseError, status);
1388     if (U_SUCCESS(status)) {
1389         UnicodeString utext = text;
1390         bi.setText(utext);
1391         int32_t position;
1392         int32_t loops;
1393         for (loops = 0; loops<10; loops++) {
1394             position = bi.next();
1395             if (position == RuleBasedBreakIterator::DONE) {
1396                 break;
1397             }
1398         }
1399         TEST_ASSERT(loops == 1);
1400     } else {
1401         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1402     }
1403 }
1404
1405
1406
1407 //-------------------------------------------------------------------------------
1408 //
1409 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1410 //    return the datain one big UChar * buffer, which the caller must delete.
1411 //
1412 //    parameters:
1413 //          fileName:   the name of the file, with no directory part.  The test data directory
1414 //                      is assumed.
1415 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1416 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1417 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1418 //                      Pass NULL for the system default encoding.
1419 //          status
1420 //    returns:
1421 //                      The file data, converted to UChar.
1422 //                      The caller must delete this when done with
1423 //                           delete [] theBuffer;
1424 //
1425 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1426 //           Move this function to some common place.
1427 //
1428 //--------------------------------------------------------------------------------
1429 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1430     UChar       *retPtr  = NULL;
1431     char        *fileBuf = NULL;
1432     UConverter* conv     = NULL;
1433     FILE        *f       = NULL;
1434
1435     ulen = 0;
1436     if (U_FAILURE(status)) {
1437         return retPtr;
1438     }
1439
1440     //
1441     //  Open the file.
1442     //
1443     f = fopen(fileName, "rb");
1444     if (f == 0) {
1445         dataerrln("Error opening test data file %s\n", fileName);
1446         status = U_FILE_ACCESS_ERROR;
1447         return NULL;
1448     }
1449     //
1450     //  Read it in
1451     //
1452     int   fileSize;
1453     int   amt_read;
1454
1455     fseek( f, 0, SEEK_END);
1456     fileSize = ftell(f);
1457     fileBuf = new char[fileSize];
1458     fseek(f, 0, SEEK_SET);
1459     amt_read = fread(fileBuf, 1, fileSize, f);
1460     if (amt_read != fileSize || fileSize <= 0) {
1461         errln("Error reading test data file.");
1462         goto cleanUpAndReturn;
1463     }
1464
1465     //
1466     // Look for a Unicode Signature (BOM) on the data just read
1467     //
1468     int32_t        signatureLength;
1469     const char *   fileBufC;
1470     const char*    bomEncoding;
1471
1472     fileBufC = fileBuf;
1473     bomEncoding = ucnv_detectUnicodeSignature(
1474         fileBuf, fileSize, &signatureLength, &status);
1475     if(bomEncoding!=NULL ){
1476         fileBufC  += signatureLength;
1477         fileSize  -= signatureLength;
1478         encoding = bomEncoding;
1479     }
1480
1481     //
1482     // Open a converter to take the rule file to UTF-16
1483     //
1484     conv = ucnv_open(encoding, &status);
1485     if (U_FAILURE(status)) {
1486         goto cleanUpAndReturn;
1487     }
1488
1489     //
1490     // Convert the rules to UChar.
1491     //  Preflight first to determine required buffer size.
1492     //
1493     ulen = ucnv_toUChars(conv,
1494         NULL,           //  dest,
1495         0,              //  destCapacity,
1496         fileBufC,
1497         fileSize,
1498         &status);
1499     if (status == U_BUFFER_OVERFLOW_ERROR) {
1500         // Buffer Overflow is expected from the preflight operation.
1501         status = U_ZERO_ERROR;
1502
1503         retPtr = new UChar[ulen+1];
1504         ucnv_toUChars(conv,
1505             retPtr,       //  dest,
1506             ulen+1,
1507             fileBufC,
1508             fileSize,
1509             &status);
1510     }
1511
1512 cleanUpAndReturn:
1513     fclose(f);
1514     delete []fileBuf;
1515     ucnv_close(conv);
1516     if (U_FAILURE(status)) {
1517         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1518         delete []retPtr;
1519         retPtr = 0;
1520         ulen   = 0;
1521     };
1522     return retPtr;
1523 }
1524
1525
1526
1527 //--------------------------------------------------------------------------------------------
1528 //
1529 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1530 //
1531 //-------------------------------------------------------------------------------------------
1532 void RBBITest::TestUnicodeFiles() {
1533     RuleBasedBreakIterator  *bi;
1534     UErrorCode               status = U_ZERO_ERROR;
1535
1536     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1537     TEST_ASSERT_SUCCESS(status);
1538     if (U_SUCCESS(status)) {
1539         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1540     }
1541     delete bi;
1542
1543     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1544     TEST_ASSERT_SUCCESS(status);
1545     if (U_SUCCESS(status)) {
1546         runUnicodeTestData("WordBreakTest.txt", bi);
1547     }
1548     delete bi;
1549
1550     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1551     TEST_ASSERT_SUCCESS(status);
1552     if (U_SUCCESS(status)) {
1553         runUnicodeTestData("SentenceBreakTest.txt", bi);
1554     }
1555     delete bi;
1556
1557     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1558     TEST_ASSERT_SUCCESS(status);
1559     if (U_SUCCESS(status)) {
1560         runUnicodeTestData("LineBreakTest.txt", bi);
1561     }
1562     delete bi;
1563 }
1564
1565
1566 //--------------------------------------------------------------------------------------------
1567 //
1568 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1569 //
1570 //-------------------------------------------------------------------------------------------
1571 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1572 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1573     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1574     UBool isTicket7270Fixed = isICUVersionAtLeast(52, 1);
1575     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1576     UErrorCode  status = U_ZERO_ERROR;
1577
1578     //
1579     //  Open and read the test data file, put it into a UnicodeString.
1580     //
1581     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1582     char testFileName[1000];
1583     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1584         dataerrln("Can't open test data.  Path too long.");
1585         return;
1586     }
1587     strcpy(testFileName, testDataDirectory);
1588     strcat(testFileName, fileName);
1589
1590     logln("Opening data file %s\n", fileName);
1591
1592     int    len;
1593     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1594     if (status != U_FILE_ACCESS_ERROR) {
1595         TEST_ASSERT_SUCCESS(status);
1596         TEST_ASSERT(testFile != NULL);
1597     }
1598     if (U_FAILURE(status) || testFile == NULL) {
1599         return; /* something went wrong, error already output */
1600     }
1601     UnicodeString testFileAsString(TRUE, testFile, len);
1602
1603     //
1604     //  Parse the test data file using a regular expression.
1605     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1606     //     is identified by which group had a match.
1607     //
1608     //    Caputure Group #                  1          2            3            4           5
1609     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1610     //
1611     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1612     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1613     UnicodeString   testString;
1614     UVector32       breakPositions(status);
1615     int             lineNumber = 1;
1616     TEST_ASSERT_SUCCESS(status);
1617     if (U_FAILURE(status)) {
1618         return;
1619     }
1620
1621     //
1622     //  Scan through each test case, building up the string to be broken in testString,
1623     //   and the positions that should be boundaries in the breakPositions vector.
1624     //
1625     int spin = 0;
1626     while (tokenMatcher.find()) {
1627         if(tokenMatcher.hitEnd()) {
1628           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1629              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1630              and caused an infinite loop here on EBCDIC systems!
1631           */
1632           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1633           //       return;
1634         }
1635         if (tokenMatcher.start(1, status) >= 0) {
1636             // Scanned a divide sign, indicating a break position in the test data.
1637             if (testString.length()>0) {
1638                 breakPositions.addElement(testString.length(), status);
1639             }
1640         }
1641         else if (tokenMatcher.start(2, status) >= 0) {
1642             // Scanned an 'x', meaning no break at this position in the test data
1643             //   Nothing to be done here.
1644             }
1645         else if (tokenMatcher.start(3, status) >= 0) {
1646             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1647             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1648             int length = hexNumber.length();
1649             if (length<=8) {
1650                 char buf[10];
1651                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1652                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1653                 if (c<=0x10ffff) {
1654                     testString.append(c);
1655                 } else {
1656                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1657                        fileName, lineNumber);
1658                 }
1659             } else {
1660                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1661                        fileName, lineNumber);
1662              }
1663         }
1664         else if (tokenMatcher.start(4, status) >= 0) {
1665             // Scanned to end of a line, possibly skipping over a comment in the process.
1666             //   If the line from the file contained test data, run the test now.
1667             //
1668             if (testString.length() > 0) {
1669 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1670 //             Rule 8
1671 //                ZW SP* <break>
1672 //             is not yet implemented.
1673 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1674                                             5202 == lineNumber ||
1675                                             5214 == lineNumber ||
1676                                             5246 == lineNumber ||
1677                                             5298 == lineNumber ||
1678                                             5302 == lineNumber ))) {
1679                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1680 }
1681             }
1682
1683             // Clear out this test case.
1684             //    The string and breakPositions vector will be refilled as the next
1685             //       test case is parsed.
1686             testString.remove();
1687             breakPositions.removeAllElements();
1688             lineNumber++;
1689         } else {
1690             // Scanner catchall.  Something unrecognized appeared on the line.
1691             char token[16];
1692             UnicodeString uToken = tokenMatcher.group(0, status);
1693             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1694             token[sizeof(token)-1] = 0;
1695             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1696
1697             // Clean up, in preparation for continuing with the next line.
1698             testString.remove();
1699             breakPositions.removeAllElements();
1700             lineNumber++;
1701         }
1702         TEST_ASSERT_SUCCESS(status);
1703         if (U_FAILURE(status)) {
1704             break;
1705         }
1706     }
1707
1708     delete [] testFile;
1709  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1710 }
1711
1712 //--------------------------------------------------------------------------------------------
1713 //
1714 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1715 //                            test data files.  Do only a simple, forward-only check -
1716 //                            this test is mostly to check that ICU and the Unicode
1717 //                            data agree with each other.
1718 //
1719 //--------------------------------------------------------------------------------------------
1720 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1721                          const UnicodeString &testString,   // Text data to be broken
1722                          UVector32 *breakPositions,         // Positions where breaks should be found.
1723                          RuleBasedBreakIterator *bi) {
1724     int32_t pos;                 // Break Position in the test string
1725     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1726     int32_t expectedPos;         // Expected break position (index into test string)
1727
1728     bi->setText(testString);
1729     pos = bi->first();
1730     pos = bi->next();
1731
1732     while (pos != BreakIterator::DONE) {
1733         if (expectedI >= breakPositions->size()) {
1734             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1735                 testFileName, lineNumber, pos);
1736             break;
1737         }
1738         expectedPos = breakPositions->elementAti(expectedI);
1739         if (pos < expectedPos) {
1740             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1741                 testFileName, lineNumber, pos);
1742             break;
1743         }
1744         if (pos > expectedPos) {
1745             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1746                 testFileName, lineNumber, expectedPos);
1747             break;
1748         }
1749         pos = bi->next();
1750         expectedI++;
1751     }
1752
1753     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1754         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1755             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1756     }
1757 }
1758
1759
1760
1761 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1762 //---------------------------------------------------------------------------------------
1763 //
//   class RBBIMonkeyKind
//
//      Monkey Test for Break Iteration
//      Abstract interface class.   Concrete derived classes independently
//      implement the break rules for different iterator types.
//
//      The Monkey Test itself doesn't know which type of break iterator it is
//      testing, but works purely in terms of the interface defined here.
1772 //
1773 //---------------------------------------------------------------------------------------
1774 class RBBIMonkeyKind {
1775 public:
1776     // Return a UVector of UnicodeSets, representing the character classes used
1777     //   for this type of iterator.
1778     virtual  UVector  *charClasses() = 0;
1779
1780     // Set the test text on which subsequent calls to next() will operate
1781     virtual  void      setText(const UnicodeString &s) = 0;
1782
1783     // Find the next break postion, starting from the prev break position, or from zero.
1784     // Return -1 after reaching end of string.
1785     virtual  int32_t   next(int32_t i) = 0;
1786
1787     virtual ~RBBIMonkeyKind();
1788     UErrorCode       deferredStatus;
1789
1790
1791 protected:
1792     RBBIMonkeyKind();
1793
1794 private:
1795 };
1796
1797 RBBIMonkeyKind::RBBIMonkeyKind() {
1798     deferredStatus = U_ZERO_ERROR;
1799 }
1800
1801 RBBIMonkeyKind::~RBBIMonkeyKind() {
1802 }
1803
1804
1805 //----------------------------------------------------------------------------------------
1806 //
1807 //   Random Numbers.  Similar to standard lib rand() and srand()
1808 //                    Not using library to
1809 //                      1.  Get same results on all platforms.
1810 //                      2.  Get access to current seed, to more easily reproduce failures.
1811 //
1812 //---------------------------------------------------------------------------------------
// Current state of the generator.  Assign to it to reproduce a sequence.
static uint32_t m_seed = 1;

// Advance the generator one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // Unsigned arithmetic wraps modulo 2^32.
    m_seed = m_seed * kMultiplier + kIncrement;

    // Bits 16..30 of the state: same as (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
1820
1821
1822 //------------------------------------------------------------------------------------------
1823 //
1824 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1825 //                             of RBBIMonkeyKind.
1826 //
1827 //------------------------------------------------------------------------------------------
1828 class RBBICharMonkey: public RBBIMonkeyKind {
1829 public:
1830     RBBICharMonkey();
1831     virtual          ~RBBICharMonkey();
1832     virtual  UVector *charClasses();
1833     virtual  void     setText(const UnicodeString &s);
1834     virtual  int32_t  next(int32_t i);
1835 private:
1836     UVector   *fSets;
1837
1838     UnicodeSet  *fCRLFSet;
1839     UnicodeSet  *fControlSet;
1840     UnicodeSet  *fExtendSet;
1841     UnicodeSet  *fRegionalIndicatorSet;
1842     UnicodeSet  *fPrependSet;
1843     UnicodeSet  *fSpacingSet;
1844     UnicodeSet  *fLSet;
1845     UnicodeSet  *fVSet;
1846     UnicodeSet  *fTSet;
1847     UnicodeSet  *fLVSet;
1848     UnicodeSet  *fLVTSet;
1849     UnicodeSet  *fHangulSet;
1850     UnicodeSet  *fAnySet;
1851
1852     const UnicodeString *fText;
1853 };
1854
1855
1856 RBBICharMonkey::RBBICharMonkey() {
1857     UErrorCode  status = U_ZERO_ERROR;
1858
1859     fText = NULL;
1860
1861     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1862     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1863     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1864     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1865     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1866     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1867     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1868     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1869     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1870     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1871     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1872     fHangulSet  = new UnicodeSet();
1873     fHangulSet->addAll(*fLSet);
1874     fHangulSet->addAll(*fVSet);
1875     fHangulSet->addAll(*fTSet);
1876     fHangulSet->addAll(*fLVSet);
1877     fHangulSet->addAll(*fLVTSet);
1878     fAnySet     = new UnicodeSet(0, 0x10ffff);
1879
1880     fSets       = new UVector(status);
1881     fSets->addElement(fCRLFSet,    status);
1882     fSets->addElement(fControlSet, status);
1883     fSets->addElement(fExtendSet,  status);
1884     fSets->addElement(fRegionalIndicatorSet, status);
1885     if (!fPrependSet->isEmpty()) {
1886         fSets->addElement(fPrependSet, status);
1887     }
1888     fSets->addElement(fSpacingSet, status);
1889     fSets->addElement(fHangulSet,  status);
1890     fSets->addElement(fAnySet,     status);
1891     if (U_FAILURE(status)) {
1892         deferredStatus = status;
1893     }
1894 }
1895
1896
1897 void RBBICharMonkey::setText(const UnicodeString &s) {
1898     fText = &s;
1899 }
1900
1901
1902
1903 int32_t RBBICharMonkey::next(int32_t prevPos) {
1904     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1905                               //   break position being tested.  The candidate break
1906                               //   location is before p2.
1907
1908     int     breakPos = -1;
1909
1910     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1911
1912     if (U_FAILURE(deferredStatus)) {
1913         return -1;
1914     }
1915
1916     // Previous break at end of string.  return DONE.
1917     if (prevPos >= fText->length()) {
1918         return -1;
1919     }
1920     p0 = p1 = p2 = p3 = prevPos;
1921     c3 =  fText->char32At(prevPos);
1922     c0 = c1 = c2 = 0;
1923
1924     // Loop runs once per "significant" character position in the input text.
1925     for (;;) {
1926         // Move all of the positions forward in the input string.
1927         p0 = p1;  c0 = c1;
1928         p1 = p2;  c1 = c2;
1929         p2 = p3;  c2 = c3;
1930
1931         // Advancd p3 by one codepoint
1932         p3 = fText->moveIndex32(p3, 1);
1933         c3 = fText->char32At(p3);
1934
1935         if (p1 == p2) {
1936             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1937             continue;
1938         }
1939         if (p2 == fText->length()) {
1940             // Reached end of string.  Always a break position.
1941             break;
1942         }
1943
1944         // Rule  GB3   CR x LF
1945         //     No Extend or Format characters may appear between the CR and LF,
1946         //     which requires the additional check for p2 immediately following p1.
1947         //
1948         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1949             continue;
1950         }
1951
1952         // Rule (GB4).   ( Control | CR | LF ) <break>
1953         if (fControlSet->contains(c1) ||
1954             c1 == 0x0D ||
1955             c1 == 0x0A)  {
1956             break;
1957         }
1958
1959         // Rule (GB5)    <break>  ( Control | CR | LF )
1960         //
1961         if (fControlSet->contains(c2) ||
1962             c2 == 0x0D ||
1963             c2 == 0x0A)  {
1964             break;
1965         }
1966
1967
1968         // Rule (GB6)  L x ( L | V | LV | LVT )
1969         if (fLSet->contains(c1) &&
1970                (fLSet->contains(c2)  ||
1971                 fVSet->contains(c2)  ||
1972                 fLVSet->contains(c2) ||
1973                 fLVTSet->contains(c2))) {
1974             continue;
1975         }
1976
1977         // Rule (GB7)    ( LV | V )  x  ( V | T )
1978         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1979             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1980             continue;
1981         }
1982
1983         // Rule (GB8)    ( LVT | T)  x T
1984         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1985             fTSet->contains(c2))  {
1986             continue;
1987         }
1988
1989         // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
1990
1991         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1992         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1993             continue;
1994         }
1995
1996         // Rule (GB9)    Numeric x ALetter
1997         if (fExtendSet->contains(c2))  {
1998             continue;
1999         }
2000
2001         // Rule (GB9a)   x  SpacingMark
2002         if (fSpacingSet->contains(c2)) {
2003             continue;
2004         }
2005
2006         // Rule (GB9b)   Prepend x
2007         if (fPrependSet->contains(c1)) {
2008             continue;
2009         }
2010
2011         // Rule (GB10)  Any  <break>  Any
2012         break;
2013     }
2014
2015     breakPos = p2;
2016     return breakPos;
2017 }
2018
2019
2020
2021 UVector  *RBBICharMonkey::charClasses() {
2022     return fSets;
2023 }
2024
2025
2026 RBBICharMonkey::~RBBICharMonkey() {
2027     delete fSets;
2028     delete fCRLFSet;
2029     delete fControlSet;
2030     delete fExtendSet;
2031     delete fRegionalIndicatorSet;
2032     delete fPrependSet;
2033     delete fSpacingSet;
2034     delete fLSet;
2035     delete fVSet;
2036     delete fTSet;
2037     delete fLVSet;
2038     delete fLVTSet;
2039     delete fHangulSet;
2040     delete fAnySet;
2041 }
2042
2043 //------------------------------------------------------------------------------------------
2044 //
2045 //   class RBBIWordMonkey      Word Break specific implementation
2046 //                             of RBBIMonkeyKind.
2047 //
2048 //------------------------------------------------------------------------------------------
2049 class RBBIWordMonkey: public RBBIMonkeyKind {
2050 public:
2051     RBBIWordMonkey();
2052     virtual          ~RBBIWordMonkey();
2053     virtual  UVector *charClasses();
2054     virtual  void     setText(const UnicodeString &s);
2055     virtual int32_t   next(int32_t i);
2056 private:
2057     UVector      *fSets;
2058
2059     UnicodeSet  *fCRSet;
2060     UnicodeSet  *fLFSet;
2061     UnicodeSet  *fNewlineSet;
2062     UnicodeSet  *fKatakanaSet;
2063     UnicodeSet  *fALetterSet;
2064     // TODO(jungshik): Do we still need this change?
2065     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2066     UnicodeSet  *fMidNumLetSet;
2067     UnicodeSet  *fMidLetterSet;
2068     UnicodeSet  *fMidNumSet;
2069     UnicodeSet  *fNumericSet;
2070     UnicodeSet  *fFormatSet;
2071     UnicodeSet  *fOtherSet;
2072     UnicodeSet  *fExtendSet;
2073     UnicodeSet  *fExtendNumLetSet;
2074     UnicodeSet  *fRegionalIndicatorSet;
2075     UnicodeSet  *fDictionaryCjkSet;
2076
2077     RegexMatcher  *fMatcher;
2078
2079     const UnicodeString  *fText;
2080 };
2081
2082
2083 RBBIWordMonkey::RBBIWordMonkey()
2084 {
2085     UErrorCode  status = U_ZERO_ERROR;
2086
2087     fSets            = new UVector(status);
2088
2089     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2090     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2091     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2092     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2093     // Exclude Hangul syllables from ALetterSet during testing.
2094     // Leave CJK dictionary characters out from the monkey tests!
2095 #if 0
2096     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2097                                       "[\\p{Line_Break = Complex_Context}"
2098                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2099                                       "-\\p{Grapheme_Cluster_Break = Control}"
2100                                       "]]",
2101                                       status);
2102 #endif
2103     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2104     fALetterSet->removeAll(*fDictionaryCjkSet);
2105     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2106     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2107     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2108     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2109     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2110     // we should figure out why
2111     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2112     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2113     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2114     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2115     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2116
2117     fOtherSet        = new UnicodeSet();
2118     if(U_FAILURE(status)) {
2119       deferredStatus = status;
2120       return;
2121     }
2122
2123     fOtherSet->complement();
2124     fOtherSet->removeAll(*fCRSet);
2125     fOtherSet->removeAll(*fLFSet);
2126     fOtherSet->removeAll(*fNewlineSet);
2127     fOtherSet->removeAll(*fKatakanaSet);
2128     fOtherSet->removeAll(*fALetterSet);
2129     fOtherSet->removeAll(*fMidLetterSet);
2130     fOtherSet->removeAll(*fMidNumSet);
2131     fOtherSet->removeAll(*fNumericSet);
2132     fOtherSet->removeAll(*fExtendNumLetSet);
2133     fOtherSet->removeAll(*fFormatSet);
2134     fOtherSet->removeAll(*fExtendSet);
2135     fOtherSet->removeAll(*fRegionalIndicatorSet);
2136     // Inhibit dictionary characters from being tested at all.
2137     fOtherSet->removeAll(*fDictionaryCjkSet);
2138     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2139
2140     fSets->addElement(fCRSet,        status);
2141     fSets->addElement(fLFSet,        status);
2142     fSets->addElement(fNewlineSet,   status);
2143     fSets->addElement(fALetterSet,   status);
2144     //fSets->addElement(fKatakanaSet,  status); //TODO: work out how to test katakana
2145     fSets->addElement(fMidLetterSet, status);
2146     fSets->addElement(fMidNumLetSet, status);
2147     fSets->addElement(fMidNumSet,    status);
2148     fSets->addElement(fNumericSet,   status);
2149     fSets->addElement(fFormatSet,    status);
2150     fSets->addElement(fExtendSet,    status);
2151     fSets->addElement(fOtherSet,     status);
2152     fSets->addElement(fExtendNumLetSet, status);
2153     fSets->addElement(fRegionalIndicatorSet, status);
2154
2155     if (U_FAILURE(status)) {
2156         deferredStatus = status;
2157     }
2158 }
2159
2160 void RBBIWordMonkey::setText(const UnicodeString &s) {
2161     fText       = &s;
2162 }
2163
2164
2165 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2166     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2167                               //   break position being tested.  The candidate break
2168                               //   location is before p2.
2169
2170     int     breakPos = -1;
2171
2172     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2173
2174     if (U_FAILURE(deferredStatus)) {
2175         return -1;
2176     }
2177
2178     // Prev break at end of string.  return DONE.
2179     if (prevPos >= fText->length()) {
2180         return -1;
2181     }
2182     p0 = p1 = p2 = p3 = prevPos;
2183     c3 =  fText->char32At(prevPos);
2184     c0 = c1 = c2 = 0;
2185
2186     // Loop runs once per "significant" character position in the input text.
2187     for (;;) {
2188         // Move all of the positions forward in the input string.
2189         p0 = p1;  c0 = c1;
2190         p1 = p2;  c1 = c2;
2191         p2 = p3;  c2 = c3;
2192
2193         // Advancd p3 by    X(Extend | Format)*   Rule 4
2194         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2195         do {
2196             p3 = fText->moveIndex32(p3, 1);
2197             c3 = fText->char32At(p3);
2198             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2199                break;
2200             };
2201         }
2202         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2203
2204
2205         if (p1 == p2) {
2206             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2207             continue;
2208         }
2209         if (p2 == fText->length()) {
2210             // Reached end of string.  Always a break position.
2211             break;
2212         }
2213
2214         // Rule  (3)   CR x LF
2215         //     No Extend or Format characters may appear between the CR and LF,
2216         //     which requires the additional check for p2 immediately following p1.
2217         //
2218         if (c1==0x0D && c2==0x0A) {
2219             continue;
2220         }
2221
2222         // Rule (3a)  Break before and after newlines (including CR and LF)
2223         //
2224         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2225             break;
2226         };
2227         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2228             break;
2229         };
2230
2231         // Rule (5).   ALetter x ALetter
2232         if (fALetterSet->contains(c1) &&
2233             fALetterSet->contains(c2))  {
2234             continue;
2235         }
2236
2237         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2238         //
2239         if ( fALetterSet->contains(c1)   &&
2240              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2241              fALetterSet->contains(c3)) {
2242             continue;
2243         }
2244
2245
2246         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2247         if (fALetterSet->contains(c0) &&
2248             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2249             fALetterSet->contains(c2)) {
2250             continue;
2251         }
2252
2253         // Rule (8)    Numeric x Numeric
2254         if (fNumericSet->contains(c1) &&
2255             fNumericSet->contains(c2))  {
2256             continue;
2257         }
2258
2259         // Rule (9)    ALetter x Numeric
2260         if (fALetterSet->contains(c1) &&
2261             fNumericSet->contains(c2))  {
2262             continue;
2263         }
2264
2265         // Rule (10)    Numeric x ALetter
2266         if (fNumericSet->contains(c1) &&
2267             fALetterSet->contains(c2))  {
2268             continue;
2269         }
2270
2271         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2272         if (fNumericSet->contains(c0) &&
2273             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2274             fNumericSet->contains(c2)) {
2275             continue;
2276         }
2277
2278         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2279         if (fNumericSet->contains(c1) &&
2280             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2281             fNumericSet->contains(c3)) {
2282             continue;
2283         }
2284
2285         // Rule (13)  Katakana x Katakana
2286         if (fKatakanaSet->contains(c1) &&
2287             fKatakanaSet->contains(c2))  {
2288             continue;
2289         }
2290
2291         // Rule 13a
2292         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2293              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2294              fExtendNumLetSet->contains(c2)) {
2295                 continue;
2296         }
2297
2298         // Rule 13b
2299         if (fExtendNumLetSet->contains(c1) &&
2300                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2301                 fKatakanaSet->contains(c2)))  {
2302                 continue;
2303         }
2304
2305         // Rule 13c
2306         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2307             continue;
2308         }
2309
2310         // Rule 14.  Break found here.
2311         break;
2312     }
2313
2314     breakPos = p2;
2315     return breakPos;
2316 }
2317
2318
2319 UVector  *RBBIWordMonkey::charClasses() {
2320     return fSets;
2321 }
2322
2323
2324 RBBIWordMonkey::~RBBIWordMonkey() {
2325     delete fSets;
2326     delete fCRSet;
2327     delete fLFSet;
2328     delete fNewlineSet;
2329     delete fKatakanaSet;
2330     delete fALetterSet;
2331     delete fMidNumLetSet;
2332     delete fMidLetterSet;
2333     delete fMidNumSet;
2334     delete fNumericSet;
2335     delete fFormatSet;
2336     delete fExtendSet;
2337     delete fExtendNumLetSet;
2338     delete fRegionalIndicatorSet;
2339     delete fDictionaryCjkSet;
2340     delete fOtherSet;
2341 }
2342
2343
2344
2345
2346 //------------------------------------------------------------------------------------------
2347 //
2348 //   class RBBISentMonkey      Sentence Break specific implementation
2349 //                             of RBBIMonkeyKind.
2350 //
2351 //------------------------------------------------------------------------------------------
2352 class RBBISentMonkey: public RBBIMonkeyKind {
2353 public:
2354     RBBISentMonkey();
2355     virtual          ~RBBISentMonkey();
2356     virtual  UVector *charClasses();
2357     virtual  void     setText(const UnicodeString &s);
2358     virtual int32_t   next(int32_t i);
2359 private:
2360     int               moveBack(int posFrom);
2361     int               moveForward(int posFrom);
2362     UChar32           cAt(int pos);
2363
2364     UVector      *fSets;
2365
2366     UnicodeSet  *fSepSet;
2367     UnicodeSet  *fFormatSet;
2368     UnicodeSet  *fSpSet;
2369     UnicodeSet  *fLowerSet;
2370     UnicodeSet  *fUpperSet;
2371     UnicodeSet  *fOLetterSet;
2372     UnicodeSet  *fNumericSet;
2373     UnicodeSet  *fATermSet;
2374     UnicodeSet  *fSContinueSet;
2375     UnicodeSet  *fSTermSet;
2376     UnicodeSet  *fCloseSet;
2377     UnicodeSet  *fOtherSet;
2378     UnicodeSet  *fExtendSet;
2379
2380     const UnicodeString  *fText;
2381
2382 };
2383
2384 RBBISentMonkey::RBBISentMonkey()
2385 {
2386     UErrorCode  status = U_ZERO_ERROR;
2387
2388     fSets            = new UVector(status);
2389
2390     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2391     //                       set and made into character classes of their own.  For the monkey impl,
2392     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2393     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2394     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2395     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2396     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2397     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2398     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2399     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2400     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2401     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2402     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2403     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2404     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2405     fOtherSet        = new UnicodeSet();
2406
2407     if(U_FAILURE(status)) {
2408       deferredStatus = status;
2409       return;
2410     }
2411
2412     fOtherSet->complement();
2413     fOtherSet->removeAll(*fSepSet);
2414     fOtherSet->removeAll(*fFormatSet);
2415     fOtherSet->removeAll(*fSpSet);
2416     fOtherSet->removeAll(*fLowerSet);
2417     fOtherSet->removeAll(*fUpperSet);
2418     fOtherSet->removeAll(*fOLetterSet);
2419     fOtherSet->removeAll(*fNumericSet);
2420     fOtherSet->removeAll(*fATermSet);
2421     fOtherSet->removeAll(*fSContinueSet);
2422     fOtherSet->removeAll(*fSTermSet);
2423     fOtherSet->removeAll(*fCloseSet);
2424     fOtherSet->removeAll(*fExtendSet);
2425
2426     fSets->addElement(fSepSet,       status);
2427     fSets->addElement(fFormatSet,    status);
2428     fSets->addElement(fSpSet,        status);
2429     fSets->addElement(fLowerSet,     status);
2430     fSets->addElement(fUpperSet,     status);
2431     fSets->addElement(fOLetterSet,   status);
2432     fSets->addElement(fNumericSet,   status);
2433     fSets->addElement(fATermSet,     status);
2434     fSets->addElement(fSContinueSet, status);
2435     fSets->addElement(fSTermSet,     status);
2436     fSets->addElement(fCloseSet,     status);
2437     fSets->addElement(fOtherSet,     status);
2438     fSets->addElement(fExtendSet,    status);
2439
2440     if (U_FAILURE(status)) {
2441         deferredStatus = status;
2442     }
2443 }
2444
2445
2446
2447 void RBBISentMonkey::setText(const UnicodeString &s) {
2448     fText       = &s;
2449 }
2450
2451 UVector  *RBBISentMonkey::charClasses() {
2452     return fSets;
2453 }
2454
2455
2456 //  moveBack()   Find the "significant" code point preceding the index i.
2457 //               Skips over ($Extend | $Format)* .
2458 //
2459 int RBBISentMonkey::moveBack(int i) {
2460     if (i <= 0) {
2461         return -1;
2462     }
2463     UChar32   c;
2464     int32_t   j = i;
2465     do {
2466         j = fText->moveIndex32(j, -1);
2467         c = fText->char32At(j);
2468     }
2469     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2470     return j;
2471
2472  }
2473
2474
2475 int RBBISentMonkey::moveForward(int i) {
2476     if (i>=fText->length()) {
2477         return fText->length();
2478     }
2479     UChar32   c;
2480     int32_t   j = i;
2481     do {
2482         j = fText->moveIndex32(j, 1);
2483         c = cAt(j);
2484     }
2485     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2486     return j;
2487 }
2488
2489 UChar32 RBBISentMonkey::cAt(int pos) {
2490     if (pos<0 || pos>=fText->length()) {
2491         return -1;
2492     } else {
2493         return fText->char32At(pos);
2494     }
2495 }
2496
// next()   Find the sentence boundary that follows prevPos.
//          Slides a four-position window (p0..p3) of "significant" code points
//          forward through the text and tests the position just before p2 against
//          the rule checks below, in order.  A rule that forbids a break does
//          "continue"; a rule that requires one does "break".
//          Returns the index of the boundary found, or -1 when deferredStatus
//          holds a failure or prevPos is at or beyond the end of the text.
2497 int32_t RBBISentMonkey::next(int32_t prevPos) {
2498     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2499                               //   break position being tested.  The candidate break
2500                               //   location is before p2.
2501
2502     int     breakPos = -1;
2503
2504     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2505     UChar32 c;                // Scratch code point used by the look-behind / look-ahead scans.
2506
2507     if (U_FAILURE(deferredStatus)) {
2508         return -1;
2509     }
2510
2511     // Prev break at end of string.  return DONE.
2512     if (prevPos >= fText->length()) {
2513         return -1;
2514     }
2515     p0 = p1 = p2 = p3 = prevPos;
2516     c3 =  fText->char32At(prevPos);
2517     c0 = c1 = c2 = 0;
2518
2519     // Loop runs once per "significant" character position in the input text.
2520     for (;;) {
2521         // Move all of the positions forward in the input string.
2522         p0 = p1;  c0 = c1;
2523         p1 = p2;  c1 = c2;
2524         p2 = p3;  c2 = c3;
2525
2526         // Advance p3 by    X(Extend | Format)*   Rule (5)
2527         p3 = moveForward(p3);
2528         c3 = cAt(p3);     // -1 when p3 has run off the end of the text.
2529
2530         // Rule (3)  CR x LF
2531         //           The p2==(p1+1) test requires the two to be directly adjacent.
2532         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2533             continue;
2534         }
2535
2536         // Rule (4).   Sep  <break>
2537         if (fSepSet->contains(c1)) {
2538             p2 = p1+1;   // Separators don't combine with Extend or Format.
2539             break;
2540         }
2541
2542         if (p2 >= fText->length()) {
2543             // Reached end of string.  Always a break position.
2544             break;
2545         }
2546
2547         if (p2 == prevPos) {
2548             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2549             continue;
2550         }
2551
2552         // Rule (6).   ATerm x Numeric
2553         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2554             continue;
2555         }
2556
2557         // Rule (7).  Upper ATerm  x  Upper
2558         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2559             continue;
2560         }
2561
2562         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2563         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2564         //                  note to the Unicode 5.0 documents.
2565         //           p8 first scans backwards from p1 over Sp* then Close*, and is then
2566         //           reused to scan forwards from p2 looking for the Lower.
2567         int p8 = p1;
2568         while (fSpSet->contains(cAt(p8))) {
2569             p8 = moveBack(p8);
2570         }
2571         while (fCloseSet->contains(cAt(p8))) {
2572             p8 = moveBack(p8);
2573         }
2574         if (fATermSet->contains(cAt(p8))) {
2575             p8=p2;
2576             for (;;) {
2577                 c = cAt(p8);
2578                 // Stop at end of text (c == -1) or at the first character that
2579                 // belongs to one of the classes excluded by the rule.
2580                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2581                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2582                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2583                     break;
2584                 }
2585                 p8 = moveForward(p8);
2586             }
2587             if (fLowerSet->contains(cAt(p8))) {
2588                 continue;
2589             }
2590         }
2591
2592         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2593         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2594             p8 = p1;
2595             while (fSpSet->contains(cAt(p8))) {
2596                 p8 = moveBack(p8);
2597             }
2598             while (fCloseSet->contains(cAt(p8))) {
2599                 p8 = moveBack(p8);
2600             }
2601             c = cAt(p8);
2602             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2603                 continue;
2604             }
2605         }
2606
2607         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2608         //           CR and LF need no separate test: the constructor puts them in fSepSet.
2609         int p9 = p1;
2610         while (fCloseSet->contains(cAt(p9))) {
2611             p9 = moveBack(p9);
2612         }
2613         c = cAt(p9);
2614         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2615             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2616                 continue;
2617             }
2618         }
2619
2620         // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2621         int p10 = p1;
2622         while (fSpSet->contains(cAt(p10))) {
2623             p10 = moveBack(p10);
2624         }
2625         while (fCloseSet->contains(cAt(p10))) {
2626             p10 = moveBack(p10);
2627         }
2628         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2629             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2630                 continue;
2631             }
2632         }
2633
2634         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2635         //            The optional separator is stepped over at most once ("if", not "while").
2636         int p11 = p1;
2637         if (fSepSet->contains(cAt(p11))) {
2638             p11 = moveBack(p11);
2639         }
2640         while (fSpSet->contains(cAt(p11))) {
2641             p11 = moveBack(p11);
2642         }
2643         while (fCloseSet->contains(cAt(p11))) {
2644             p11 = moveBack(p11);
2645         }
2646         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2647             break;
2648         }
2649
2650         //  Rule (12)  Any x Any
2651         continue;
2652     }
2653     // Every exit from the loop leaves the boundary in p2.
2654     breakPos = p2;
2655     return breakPos;
2656 }
2649
2650 RBBISentMonkey::~RBBISentMonkey() {
2651     delete fSets;
2652     delete fSepSet;
2653     delete fFormatSet;
2654     delete fSpSet;
2655     delete fLowerSet;
2656     delete fUpperSet;
2657     delete fOLetterSet;
2658     delete fNumericSet;
2659     delete fATermSet;
2660     delete fSContinueSet;
2661     delete fSTermSet;
2662     delete fCloseSet;
2663     delete fOtherSet;
2664     delete fExtendSet;
2665 }
2666
2667
2668
2669 //-------------------------------------------------------------------------------------------
2670 //
2671 //  RBBILineMonkey
2672 //
2673 //-------------------------------------------------------------------------------------------
2674
2675 class RBBILineMonkey: public RBBIMonkeyKind {
2676 public:
2677     RBBILineMonkey();
2678     virtual          ~RBBILineMonkey();
2679     virtual  UVector *charClasses();
2680     virtual  void     setText(const UnicodeString &s);
2681     virtual  int32_t  next(int32_t i);
2682     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2683 private:
2684     UVector      *fSets;
2685
2686     UnicodeSet  *fBK;
2687     UnicodeSet  *fCR;
2688     UnicodeSet  *fLF;
2689     UnicodeSet  *fCM;
2690     UnicodeSet  *fNL;
2691     UnicodeSet  *fSG;
2692     UnicodeSet  *fWJ;
2693     UnicodeSet  *fZW;
2694     UnicodeSet  *fGL;
2695     UnicodeSet  *fCB;
2696     UnicodeSet  *fSP;
2697     UnicodeSet  *fB2;
2698     UnicodeSet  *fBA;
2699     UnicodeSet  *fBB;
2700     UnicodeSet  *fHY;
2701     UnicodeSet  *fH2;
2702     UnicodeSet  *fH3;
2703     UnicodeSet  *fCL;
2704     UnicodeSet  *fCP;
2705     UnicodeSet  *fEX;
2706     UnicodeSet  *fIN;
2707     UnicodeSet  *fJL;
2708     UnicodeSet  *fJV;
2709     UnicodeSet  *fJT;
2710     UnicodeSet  *fNS;
2711     UnicodeSet  *fOP;
2712     UnicodeSet  *fQU;
2713     UnicodeSet  *fIS;
2714     UnicodeSet  *fNU;
2715     UnicodeSet  *fPO;
2716     UnicodeSet  *fPR;
2717     UnicodeSet  *fSY;
2718     UnicodeSet  *fAI;
2719     UnicodeSet  *fAL;
2720     UnicodeSet  *fCJ;
2721     UnicodeSet  *fHL;
2722     UnicodeSet  *fID;
2723     UnicodeSet  *fRI;
2724     UnicodeSet  *fSA;
2725     UnicodeSet  *fXX;
2726
2727     BreakIterator  *fCharBI;
2728
2729     const UnicodeString  *fText;
2730     int32_t              *fOrigPositions;
2731
2732     RegexMatcher         *fNumberMatcher;
2733     RegexMatcher         *fLB11Matcher;
2734 };
2735
2736
2737 RBBILineMonkey::RBBILineMonkey()
2738 {
2739     UErrorCode  status = U_ZERO_ERROR;
2740
2741     fSets  = new UVector(status);
2742
2743     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2744     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2745     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2746     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2747     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2748     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2749     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2750     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2751     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2752     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2753     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2754     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2755     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2756     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2757     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2758     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2759     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2760     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2761     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2762     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2763     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2764     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2765     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2766     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2767     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2768     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2769     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2770     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2771     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2772     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2773     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2774     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2775     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2776     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2777     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2778     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2779     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2780     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2781     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2782     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2783
2784     if (U_FAILURE(status)) {
2785         deferredStatus = status;
2786         fCharBI = NULL;
2787         fNumberMatcher = NULL;
2788         return;
2789     }
2790
2791     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2792     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2793     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2794     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2795
2796     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2797
2798     fSets->addElement(fBK, status);
2799     fSets->addElement(fCR, status);
2800     fSets->addElement(fLF, status);
2801     fSets->addElement(fCM, status);
2802     fSets->addElement(fNL, status);
2803     fSets->addElement(fWJ, status);
2804     fSets->addElement(fZW, status);
2805     fSets->addElement(fGL, status);
2806     fSets->addElement(fCB, status);
2807     fSets->addElement(fSP, status);
2808     fSets->addElement(fB2, status);
2809     fSets->addElement(fBA, status);
2810     fSets->addElement(fBB, status);
2811     fSets->addElement(fHY, status);
2812     fSets->addElement(fH2, status);
2813     fSets->addElement(fH3, status);
2814     fSets->addElement(fCL, status);
2815     fSets->addElement(fCP, status);
2816     fSets->addElement(fEX, status);
2817     fSets->addElement(fIN, status);
2818     fSets->addElement(fJL, status);
2819     fSets->addElement(fJT, status);
2820     fSets->addElement(fJV, status);
2821     fSets->addElement(fNS, status);
2822     fSets->addElement(fOP, status);
2823     fSets->addElement(fQU, status);
2824     fSets->addElement(fIS, status);
2825     fSets->addElement(fNU, status);
2826     fSets->addElement(fPO, status);
2827     fSets->addElement(fPR, status);
2828     fSets->addElement(fSY, status);
2829     fSets->addElement(fAI, status);
2830     fSets->addElement(fAL, status);
2831     fSets->addElement(fHL, status);
2832     fSets->addElement(fID, status);
2833     fSets->addElement(fWJ, status);
2834     fSets->addElement(fRI, status);
2835     fSets->addElement(fSA, status);
2836     fSets->addElement(fSG, status);
2837
2838     const char *rules =
2839             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2840             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2841             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2842             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2843             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2844             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2845
2846     fNumberMatcher = new RegexMatcher(
2847         UnicodeString(rules, -1, US_INV), 0, status);
2848
2849     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2850
2851     if (U_FAILURE(status)) {
2852         deferredStatus = status;
2853     }
2854 }
2855
2856
2857 void RBBILineMonkey::setText(const UnicodeString &s) {
2858     fText       = &s;
2859     fCharBI->setText(s);
2860     fNumberMatcher->reset(s);
2861 }
2862
2863 //
2864 //  rule9Adjust
2865 //     Line Break TR rules 9 and 10 implementation.
2866 //     This deals with combining marks and other sequences that
2867 //     that must be treated as if they were something other than what they actually are.
2868 //
2869 //     This is factored out into a separate function because it must be applied twice for
2870 //     each potential break, once to the chars before the position being checked, then
2871 //     again to the text following the possible break.
2872 //
2873 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2874     if (pos == -1) {
2875         // Invalid initial position.  Happens during the warmup iteration of the
2876         //   main loop in next().
2877         return;
2878     }
2879
2880     int32_t  nPos = *nextPos;
2881
2882     // LB 9  Keep combining sequences together.
2883     //  advance over any CM class chars.  Note that Line Break CM is different
2884     //  from the normal Grapheme Extend property.
2885     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2886           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2887         for (;;) {
2888             *nextChar = fText->char32At(nPos);
2889             if (!fCM->contains(*nextChar)) {
2890                 break;
2891             }
2892             nPos = fText->moveIndex32(nPos, 1);
2893         }
2894     }
2895
2896
2897     // LB 9 Treat X CM* as if it were x.
2898     //       No explicit action required.
2899
2900     // LB 10  Treat any remaining combining mark as AL
2901     if (fCM->contains(*posChar)) {
2902         *posChar = 0x41;   // thisChar = 'A';
2903     }
2904
2905     // Push the updated nextPos and nextChar back to our caller.
2906     // This only makes a difference if posChar got bigger by consuming a
2907     // combining sequence.
2908     *nextPos  = nPos;
2909     *nextChar = fText->char32At(nPos);
2910 }
2911
2912
2913
2914 int32_t RBBILineMonkey::next(int32_t startPos) {
2915     UErrorCode status = U_ZERO_ERROR;
2916     int32_t    pos;       //  Index of the char following a potential break position
2917     UChar32    thisChar;  //  Character at above position "pos"
2918
2919     int32_t    prevPos;   //  Index of the char preceding a potential break position
2920     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2921                           //   and thisChar may not be adjacent because combining
2922                           //   characters between them will be ignored.
2923
2924     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2925     UChar32    prevCharX2;
2926
2927     int32_t    nextPos;   //  Index of the next character following pos.
2928                           //     Usually skips over combining marks.
2929     int32_t    nextCPPos; //  Index of the code point following "pos."
2930                           //     May point to a combining mark.
2931     int32_t    tPos;      //  temp value.
2932     UChar32    c;
2933
2934     if (U_FAILURE(deferredStatus)) {
2935         return -1;
2936     }
2937
2938     if (startPos >= fText->length()) {
2939         return -1;
2940     }
2941
2942
2943     // Initial values for loop.  Loop will run the first time without finding breaks,
2944     //                           while the invalid values shift out and the "this" and
2945     //                           "prev" positions are filled in with good values.
2946     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2947     thisChar = prevChar  = prevCharX2 = 0;
2948     nextPos  = nextCPPos = startPos;
2949
2950
2951     // Loop runs once per position in the test text, until a break position
2952     //  is found.
2953     for (;;) {
2954         prevPosX2 = prevPos;
2955         prevCharX2 = prevChar;
2956
2957         prevPos   = pos;
2958         prevChar  = thisChar;
2959
2960         pos       = nextPos;
2961         thisChar  = fText->char32At(pos);
2962
2963         nextCPPos = fText->moveIndex32(pos, 1);
2964         nextPos   = nextCPPos;
2965
2966         // Rule LB2 - Break at end of text.
2967         if (pos >= fText->length()) {
2968             break;
2969         }
2970
2971         // Rule LB 9 - adjust for combining sequences.
2972         //             We do this one out-of-order because the adjustment does not change anything
2973         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
2974         //             be applied.
2975         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
2976         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
2977         c = fText->char32At(nextPos);
2978         rule9Adjust(pos,     &thisChar, &nextPos, &c);
2979
2980         // If the loop is still warming up - if we haven't shifted the initial
2981         //   -1 positions out of prevPos yet - loop back to advance the
2982         //    position in the input without any further looking for breaks.
2983         if (prevPos == -1) {
2984             continue;
2985         }
2986
2987         // LB 4  Always break after hard line breaks,
2988         if (fBK->contains(prevChar)) {
2989             break;
2990         }
2991
2992         // LB 5  Break after CR, LF, NL, but not inside CR LF
2993         if (prevChar == 0x0d && thisChar == 0x0a) {
2994             continue;
2995         }
2996         if (prevChar == 0x0d ||
2997             prevChar == 0x0a ||
2998             prevChar == 0x85)  {
2999             break;
3000         }
3001
3002         // LB 6  Don't break before hard line breaks
3003         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3004             fBK->contains(thisChar)) {
3005                 continue;
3006         }
3007
3008
3009         // LB 7  Don't break before spaces or zero-width space.
3010         if (fSP->contains(thisChar)) {
3011             continue;
3012         }
3013
3014         if (fZW->contains(thisChar)) {
3015             continue;
3016         }
3017
3018         // LB 8  Break after zero width space
3019         if (fZW->contains(prevChar)) {
3020             break;
3021         }
3022
3023         // LB 9, 10  Already done, at top of loop.
3024         //
3025
3026
3027         // LB 11  Do not break before or after WORD JOINER and related characters.
3028         //    x  WJ
3029         //    WJ  x
3030         //
3031         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3032             continue;
3033         }
3034
3035         // LB 12
3036         //    GL  x
3037         if (fGL->contains(prevChar)) {
3038             continue;
3039         }
3040
3041         // LB 12a
3042         //    [^SP BA HY] x GL
3043         if (!(fSP->contains(prevChar) ||
3044               fBA->contains(prevChar) ||
3045               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3046             continue;
3047         }
3048
3049
3050
3051         // LB 13  Don't break before closings.
3052         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3053         //        fall into LB 17 and the more general number regular expression.
3054         //
3055         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3056             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3057                                          fEX->contains(thisChar)  ||
3058             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3059             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3060             continue;
3061         }
3062
3063         // LB 14 Don't break after OP SP*
3064         //       Scan backwards, checking for this sequence.
3065         //       The OP char could include combining marks, so we actually check for
3066         //           OP CM* SP*
3067         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3068         //       sequence into a ID char, so before scanning back through spaces,
3069         //       verify that prevChar is indeed a space.  The prevChar variable
3070         //       may differ from fText[prevPos]
3071         tPos = prevPos;
3072         if (fSP->contains(prevChar)) {
3073             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3074                 tPos=fText->moveIndex32(tPos, -1);
3075             }
3076         }
3077         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3078             tPos=fText->moveIndex32(tPos, -1);
3079         }
3080         if (fOP->contains(fText->char32At(tPos))) {
3081             continue;
3082         }
3083
3084
3085         // LB 15    QU SP* x OP
3086         if (fOP->contains(thisChar)) {
3087             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3088             int tPos = prevPos;
3089             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3090                 tPos = fText->moveIndex32(tPos, -1);
3091             }
3092             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3093                 tPos = fText->moveIndex32(tPos, -1);
3094             }
3095             if (fQU->contains(fText->char32At(tPos))) {
3096                 continue;
3097             }
3098         }
3099
3100
3101
3102         // LB 16   (CL | CP) SP* x NS
3103         //    Scan backwards for SP* CM* (CL | CP)
3104         if (fNS->contains(thisChar)) {
3105             int tPos = prevPos;
3106             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3107                 tPos = fText->moveIndex32(tPos, -1);
3108             }
3109             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3110                 tPos = fText->moveIndex32(tPos, -1);
3111             }
3112             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3113                 continue;
3114             }
3115         }
3116
3117
3118         // LB 17        B2 SP* x B2
3119         if (fB2->contains(thisChar)) {
3120             //  Scan backwards, checking for the B2 CM* SP* sequence.
3121             tPos = prevPos;
3122             if (fSP->contains(prevChar)) {
3123                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3124                     tPos=fText->moveIndex32(tPos, -1);
3125                 }
3126             }
3127             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3128                 tPos=fText->moveIndex32(tPos, -1);
3129             }
3130             if (fB2->contains(fText->char32At(tPos))) {
3131                 continue;
3132             }
3133         }
3134
3135
3136         // LB 18    break after space
3137         if (fSP->contains(prevChar)) {
3138             break;
3139         }
3140
3141         // LB 19
3142         //    x   QU
3143         //    QU  x
3144         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3145             continue;
3146         }
3147
3148         // LB 20  Break around a CB
3149         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3150             break;
3151         }
3152
3153         // LB 21
3154         if (fBA->contains(thisChar) ||
3155             fHY->contains(thisChar) ||
3156             fNS->contains(thisChar) ||
3157             fBB->contains(prevChar) )   {
3158             continue;
3159         }
3160
3161         // LB 21a
3162         //   HL (HY | BA) x
3163         if (fHL->contains(prevCharX2) &&
3164                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3165             continue;
3166         }
3167
3168         // LB 21b - Added for Apple 13927604
3169         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3170             continue;
3171         }
3172
3173         // LB 22
3174         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3175             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3176             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3177             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3178             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3179             continue;
3180         }
3181
3182
3183         // LB 23    ID x PO
3184         //          AL x NU
3185         //          HL x NU
3186         //          NU x AL
3187         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3188             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3189             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3190             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3191             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3192             continue;
3193         }
3194
3195         // LB 24  Do not break between prefix and letters or ideographs.
3196         //        PR x ID
3197         //        PR x (AL | HL)
3198         //        PO x (AL | HL)
3199         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3200             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3201             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3202             continue;
3203         }
3204
3205
3206
3207         // LB 25    Numbers
3208         if (fNumberMatcher->lookingAt(prevPos, status)) {
3209             if (U_FAILURE(status)) {
3210                 break;
3211             }
3212             // Matched a number.  But could have been just a single digit, which would
3213             //    not represent a "no break here" between prevChar and thisChar
3214             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3215             if (numEndIdx > pos) {
3216                 // Number match includes at least our two chars being checked
3217                 if (numEndIdx > nextPos) {
3218                     // Number match includes additional chars.  Update pos and nextPos
3219                     //   so that next loop iteration will continue at the end of the number,
3220                     //   checking for breaks between last char in number & whatever follows.
3221                     pos = nextPos = numEndIdx;
3222                     do {
3223                         pos = fText->moveIndex32(pos, -1);
3224                         thisChar = fText->char32At(pos);
3225                     } while (fCM->contains(thisChar));
3226                 }
3227                 continue;
3228             }
3229         }
3230
3231
3232         // LB 26 Do not break a Korean syllable.
3233         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3234                                         fJV->contains(thisChar) ||
3235                                         fH2->contains(thisChar) ||
3236                                         fH3->contains(thisChar))) {
3237                                             continue;
3238                                         }
3239
3240         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3241             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3242                 continue;
3243         }
3244
3245         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3246             fJT->contains(thisChar)) {
3247                 continue;
3248         }
3249
3250         // LB 27 Treat a Korean Syllable Block the same as ID.
3251         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3252             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3253             fIN->contains(thisChar)) {
3254                 continue;
3255             }
3256         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3257             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3258             fPO->contains(thisChar)) {
3259                 continue;
3260             }
3261         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3262             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3263                 continue;
3264             }
3265
3266
3267
3268         // LB 28  Do not break between alphabetics ("at").
3269         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3270             continue;
3271         }
3272
3273         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3274         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3275             continue;
3276         }
3277
3278         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3279         //          (AL | NU) x OP
3280         //          CP x (AL | NU)
3281         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3282             continue;
3283         }
3284         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3285             continue;
3286         }
3287
3288         // LB30a  Do not break between regional indicators.
3289         //        RI x RI
3290         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3291             continue;
3292         }
3293
3294         // LB 31    Break everywhere else
3295         break;
3296
3297     }
3298
3299     return pos;
3300 }
3301
3302
3303 UVector  *RBBILineMonkey::charClasses() {
3304     return fSets;
3305 }
3306
3307
3308 RBBILineMonkey::~RBBILineMonkey() {
3309     delete fSets;
3310
3311     delete fBK;
3312     delete fCR;
3313     delete fLF;
3314     delete fCM;
3315     delete fNL;
3316     delete fWJ;
3317     delete fZW;
3318     delete fGL;
3319     delete fCB;
3320     delete fSP;
3321     delete fB2;
3322     delete fBA;
3323     delete fBB;
3324     delete fHY;
3325     delete fH2;
3326     delete fH3;
3327     delete fCL;
3328     delete fCP;
3329     delete fEX;
3330     delete fIN;
3331     delete fJL;
3332     delete fJV;
3333     delete fJT;
3334     delete fNS;
3335     delete fOP;
3336     delete fQU;
3337     delete fIS;
3338     delete fNU;
3339     delete fPO;
3340     delete fPR;
3341     delete fSY;
3342     delete fAI;
3343     delete fAL;
3344     delete fCJ;
3345     delete fHL;
3346     delete fID;
3347     delete fRI;
3348     delete fSA;
3349     delete fSG;
3350     delete fXX;
3351
3352     delete fCharBI;
3353     delete fNumberMatcher;
3354 }
3355
3356
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Selects which break iterator type to test.
//                         When omitted, all types are run.
//
//       utext             Run the test with the useUText flag set.
//
//     Each recognized parameter is removed from the parameter string as it is
//     parsed; anything left over is reported as an unrecognized parameter.
//
//-------------------------------------------------------------------------------------------
3371
3372 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3373     int32_t val = defaultVal;
3374     name.append(" *= *(-?\\d+)");
3375     UErrorCode status = U_ZERO_ERROR;
3376     RegexMatcher m(name, params, 0, status);
3377     if (m.find()) {
3378         // The param exists.  Convert the string to an int.
3379         char valString[100];
3380         int32_t paramLength = m.end(1, status) - m.start(1, status);
3381         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3382             paramLength = (int32_t)(sizeof(valString)-2);
3383         }
3384         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3385         val = strtol(valString,  NULL, 10);
3386
3387         // Delete this parameter from the params string.
3388         m.reset();
3389         params = m.replaceFirst("", status);
3390     }
3391     U_ASSERT(U_SUCCESS(status));
3392     return val;
3393 }
3394 #endif
3395
3396 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3397 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3398                                     BreakIterator *bi,
3399                                     int expected[],
3400                                     int expectedcount)
3401 {
3402     int count = 0;
3403     int i = 0;
3404     int forward[50];
3405     bi->setText(ustr);
3406     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3407         forward[count] = i;
3408         if (count < expectedcount && expected[count] != i) {
3409             test->errln("break forward test failed: expected %d but got %d",
3410                         expected[count], i);
3411             break;
3412         }
3413         count ++;
3414     }
3415     if (count != expectedcount) {
3416         printStringBreaks(ustr, expected, expectedcount);
3417         test->errln("break forward test failed: missed %d match",
3418                     expectedcount - count);
3419         return;
3420     }
3421     // testing boundaries
3422     for (i = 1; i < expectedcount; i ++) {
3423         int j = expected[i - 1];
3424         if (!bi->isBoundary(j)) {
3425             printStringBreaks(ustr, expected, expectedcount);
3426             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3427             return;
3428         }
3429         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3430             if (bi->isBoundary(j)) {
3431                 printStringBreaks(ustr, expected, expectedcount);
3432                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3433                 return;
3434             }
3435         }
3436     }
3437
3438     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3439         count --;
3440         if (forward[count] != i) {
3441             printStringBreaks(ustr, expected, expectedcount);
3442             test->errln("happy break test previous() failed: expected %d but got %d",
3443                         forward[count], i);
3444             break;
3445         }
3446     }
3447     if (count != 0) {
3448         printStringBreaks(ustr, expected, expectedcount);
3449         test->errln("break test previous() failed: missed a match");
3450         return;
3451     }
3452
3453     // testing preceding
3454     for (i = 0; i < expectedcount - 1; i ++) {
3455         // int j = expected[i] + 1;
3456         int j = ustr.moveIndex32(expected[i], 1);
3457         for (; j <= expected[i + 1]; j ++) {
3458             if (bi->preceding(j) != expected[i]) {
3459                 printStringBreaks(ustr, expected, expectedcount);
3460                 test->errln("preceding(): Not expecting boundary at position %d", j);
3461                 return;
3462             }
3463         }
3464     }
3465 }
3466 #endif
3467
3468 void RBBITest::TestWordBreaks(void)
3469 {
3470 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3471
3472     Locale        locale("en");
3473     UErrorCode    status = U_ZERO_ERROR;
3474     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3475     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3476     // Replaced any C+J characters in a row with a random sequence of characters
3477     // of the same length to make our C+J segmentation not get in the way.
3478     static const char *strlist[] =
3479     {
3480     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3481     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3482     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3483     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3484     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3485     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3486     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3487     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3488     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3489     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3490     "\\u2027\\U000e0067\\u0a47\\u00b7",
3491     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3492     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3493     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3494     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3495     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3496     "\\u0027\\u11af\\U000e0057\\u0602",
3497     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3498     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3499     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3500     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3501     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3502     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3503     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3504     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3505     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3506     "\\u18f4\\U000e0049\\u20e7\\u2027",
3507     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3508     "\\ua183\\u102d\\u0bec\\u003a",
3509     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3510     "\\u003a\\u0e57\\u0fad\\u002e",
3511     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3512     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3513     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3514     "\\u003a\\u0664\\u00b7\\u1fba",
3515     "\\u003b\\u0027\\u00b7\\u47a3",
3516     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3517     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3518     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3519     };
3520     int loop;
3521     if (U_FAILURE(status)) {
3522         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3523         return;
3524     }
3525     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3526         // printf("looping %d\n", loop);
3527         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3528         // RBBICharMonkey monkey;
3529         RBBIWordMonkey monkey;
3530
3531         int expected[50];
3532         int expectedcount = 0;
3533
3534         monkey.setText(ustr);
3535         int i;
3536         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3537             expected[expectedcount ++] = i;
3538         }
3539
3540         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3541     }
3542     delete bi;
3543 #endif
3544 }
3545
3546 void RBBITest::TestWordBoundary(void)
3547 {
3548     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3549     Locale        locale("en");
3550     UErrorCode    status = U_ZERO_ERROR;
3551     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3552     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3553     UChar         str[50];
3554     static const char *strlist[] =
3555     {
3556     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3557     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3558     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3559     "\\u2027\\U000e0067\\u0a47\\u00b7",
3560     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3561     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3562     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3563     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3564     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3565     "\\u0027\\u11af\\U000e0057\\u0602",
3566     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3567     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3568     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3569     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3570     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3571     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3572     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3573     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3574     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3575     "\\u58f4\\U000e0049\\u20e7\\u2027",
3576     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3577     "\\ua183\\u102d\\u0bec\\u003a",
3578     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3579     "\\u003a\\u0e57\\u0fad\\u002e",
3580     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3581     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3582     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3583     "\\u003a\\u0664\\u00b7\\u1fba",
3584     "\\u003b\\u0027\\u00b7\\u47a3",
3585     };
3586     int loop;
3587     if (U_FAILURE(status)) {
3588         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3589         return;
3590     }
3591     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3592         // printf("looping %d\n", loop);
3593         u_unescape(strlist[loop], str, 20);
3594         UnicodeString ustr(str);
3595         int forward[50];
3596         int count = 0;
3597
3598         bi->setText(ustr);
3599         int prev = 0;
3600         int i;
3601         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3602             forward[count ++] = i;
3603             if (i > prev) {
3604                 int j;
3605                 for (j = prev + 1; j < i; j ++) {
3606                     if (bi->isBoundary(j)) {
3607                         printStringBreaks(ustr, forward, count);
3608                         errln("happy boundary test failed: expected %d not a boundary",
3609                                j);
3610                         return;
3611                     }
3612                 }
3613             }
3614             if (!bi->isBoundary(i)) {
3615                 printStringBreaks(ustr, forward, count);
3616                 errln("happy boundary test failed: expected %d a boundary",
3617                        i);
3618                 return;
3619             }
3620             prev = i;
3621         }
3622     }
3623     delete bi;
3624 }
3625
3626 void RBBITest::TestLineBreaks(void)
3627 {
3628 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3629     Locale        locale("en");
3630     UErrorCode    status = U_ZERO_ERROR;
3631     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3632     const int32_t  STRSIZE = 50;
3633     UChar         str[STRSIZE];
3634     static const char *strlist[] =
3635     {
3636      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3637      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3638              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3639      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3640              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3641      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3642      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3643      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3644      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3645      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3646      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3647      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3648      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3649      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3650      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3651      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3652      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3653      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3654      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3655      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3656      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3657      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3658      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3659      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3660      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3661      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3662      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3663      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3664      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3665      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3666      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3667      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3668      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3669      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3670      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3671      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3672      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3673      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3674      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3675      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3676      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3677      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3678          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3679          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3680          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3681      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3682          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3683     };
3684     int loop;
3685     TEST_ASSERT_SUCCESS(status);
3686     if (U_FAILURE(status)) {
3687         return;
3688     }
3689     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3690         // printf("looping %d\n", loop);
3691         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3692         if (t >= STRSIZE) {
3693             TEST_ASSERT(FALSE);
3694             continue;
3695         }
3696
3697
3698         UnicodeString ustr(str);
3699         RBBILineMonkey monkey;
3700         if (U_FAILURE(monkey.deferredStatus)) {
3701             continue;
3702         }
3703
3704         const int EXPECTEDSIZE = 50;
3705         int expected[EXPECTEDSIZE];
3706         int expectedcount = 0;
3707
3708         monkey.setText(ustr);
3709         int i;
3710         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3711             if (expectedcount >= EXPECTEDSIZE) {
3712                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3713                 return;
3714             }
3715             expected[expectedcount ++] = i;
3716         }
3717
3718         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3719     }
3720     delete bi;
3721 #endif
3722 }
3723
3724 void RBBITest::TestSentBreaks(void)
3725 {
3726 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3727     Locale        locale("en");
3728     UErrorCode    status = U_ZERO_ERROR;
3729     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3730     UChar         str[200];
3731     static const char *strlist[] =
3732     {
3733      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3734      "This\n",
3735      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3736      "\"Sentence ending with a quote.\" Bye.",
3737      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3738      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3739      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3740      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3741      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3742      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3743      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3744              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3745              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3746              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3747      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3748              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3749              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3750              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3751              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3752              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3753     };
3754     int loop;
3755     if (U_FAILURE(status)) {
3756         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3757         return;
3758     }
3759     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3760         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3761         UnicodeString ustr(str);
3762
3763         RBBISentMonkey monkey;
3764         if (U_FAILURE(monkey.deferredStatus)) {
3765             continue;
3766         }
3767
3768         const int EXPECTEDSIZE = 50;
3769         int expected[EXPECTEDSIZE];
3770         int expectedcount = 0;
3771
3772         monkey.setText(ustr);
3773         int i;
3774         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3775             if (expectedcount >= EXPECTEDSIZE) {
3776                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3777                 return;
3778             }
3779             expected[expectedcount ++] = i;
3780         }
3781
3782         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3783     }
3784     delete bi;
3785 #endif
3786 }
3787
3788 void RBBITest::TestMonkey(char *params) {
3789 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3790
3791     UErrorCode     status    = U_ZERO_ERROR;
3792     int32_t        loopCount = 500;
3793     int32_t        seed      = 1;
3794     UnicodeString  breakType = "all";
3795     Locale         locale("en");
3796     UBool          useUText  = FALSE;
3797
3798     if (quick == FALSE) {
3799         loopCount = 10000;
3800     }
3801
3802     if (params) {
3803         UnicodeString p(params);
3804         loopCount = getIntParam("loop", p, loopCount);
3805         seed      = getIntParam("seed", p, seed);
3806
3807         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3808         if (m.find()) {
3809             breakType = m.group(1, status);
3810             m.reset();
3811             p = m.replaceFirst("", status);
3812         }
3813
3814         RegexMatcher u(" *utext", p, 0, status);
3815         if (u.find()) {
3816             useUText = TRUE;
3817             u.reset();
3818             p = u.replaceFirst("", status);
3819         }
3820
3821
3822         // m.reset(p);
3823         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3824             // Each option is stripped out of the option string as it is processed.
3825             // All options have been checked.  The option string should have been completely emptied..
3826             char buf[100];
3827             p.extract(buf, sizeof(buf), NULL, status);
3828             buf[sizeof(buf)-1] = 0;
3829             errln("Unrecognized or extra parameter:  %s\n", buf);
3830             return;
3831         }
3832
3833     }
3834
3835     if (breakType == "char" || breakType == "all") {
3836         RBBICharMonkey  m;
3837         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3838         if (U_SUCCESS(status)) {
3839             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3840             if (breakType == "all" && useUText==FALSE) {
3841                 // Also run a quick test with UText when "all" is specified
3842                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3843             }
3844         }
3845         else {
3846             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3847         }
3848         delete bi;
3849     }
3850
3851     if (breakType == "word" || breakType == "all") {
3852         logln("Word Break Monkey Test");
3853         RBBIWordMonkey  m;
3854         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3855         if (U_SUCCESS(status)) {
3856             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3857         }
3858         else {
3859             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3860         }
3861         delete bi;
3862     }
3863
3864     if (breakType == "line" || breakType == "all") {
3865         logln("Line Break Monkey Test");
3866         RBBILineMonkey  m;
3867         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3868         if (loopCount >= 10) {
3869             loopCount = loopCount / 5;   // Line break runs slower than the others.
3870         }
3871         if (U_SUCCESS(status)) {
3872             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3873         }
3874         else {
3875             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3876         }
3877         delete bi;
3878     }
3879
3880     if (breakType == "sent" || breakType == "all"  ) {
3881         logln("Sentence Break Monkey Test");
3882         RBBISentMonkey  m;
3883         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3884         if (loopCount >= 10) {
3885             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3886         }
3887         if (U_SUCCESS(status)) {
3888             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3889         }
3890         else {
3891             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3892         }
3893         delete bi;
3894     }
3895
3896 #endif
3897 }
3898
3899 //
3900 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3901 //    Parameters:
3902 //       bi      - the break iterator to use
3903 //       mk      - MonkeyKind, abstraction for obtaining expected results
3904 //       name    - Name of test (char, word, etc.) for use in error messages
3905 //       seed    - Seed for starting random number generator (parameter from user)
3906 //       numIterations
3907 //
3908 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3909                          int32_t numIterations, UBool useUText) {
3910
3911 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3912
3913     const int32_t    TESTSTRINGLEN = 500;
3914     UnicodeString    testText;
3915     int32_t          numCharClasses;
3916     UVector          *chClasses;
3917     int              expected[TESTSTRINGLEN*2 + 1];
3918     int              expectedCount = 0;
3919     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3920     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3921     char             reverseBreaks[TESTSTRINGLEN*2+1];
3922     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3923     char             followingBreaks[TESTSTRINGLEN*2+1];
3924     char             precedingBreaks[TESTSTRINGLEN*2+1];
3925     int              i;
3926     int              loopCount = 0;
3927
3928     m_seed = seed;
3929
3930     numCharClasses = mk.charClasses()->size();
3931     chClasses      = mk.charClasses();
3932
3933     // Check for errors that occured during the construction of the MonkeyKind object.
3934     //  Can't report them where they occured because errln() is a method coming from intlTest,
3935     //  and is not visible outside of RBBITest :-(
3936     if (U_FAILURE(mk.deferredStatus)) {
3937         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3938         return;
3939     }
3940
3941     // Verify that the character classes all have at least one member.
3942     for (i=0; i<numCharClasses; i++) {
3943         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3944         if (s == NULL || s->size() == 0) {
3945             errln("Character Class #%d is null or of zero size.", i);
3946             return;
3947         }
3948     }
3949
3950     while (loopCount < numIterations || numIterations == -1) {
3951         if (numIterations == -1 && loopCount % 10 == 0) {
3952             // If test is running in an infinite loop, display a periodic tic so
3953             //   we can tell that it is making progress.
3954             fprintf(stderr, ".");
3955         }
3956         // Save current random number seed, so that we can recreate the random numbers
3957         //   for this loop iteration in event of an error.
3958         seed = m_seed;
3959
3960         // Populate a test string with data.
3961         testText.truncate(0);
3962         for (i=0; i<TESTSTRINGLEN; i++) {
3963             int32_t  aClassNum = m_rand() % numCharClasses;
3964             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3965             int32_t   charIdx = m_rand() % classSet->size();
3966             UChar32   c = classSet->charAt(charIdx);
3967             if (c < 0) {   // TODO:  deal with sets containing strings.
3968                 errln("c < 0");
3969                 break;
3970             }
3971             testText.append(c);
3972         }
3973
3974         // Calculate the expected results for this test string.
3975         mk.setText(testText);
3976         memset(expectedBreaks, 0, sizeof(expectedBreaks));
3977         expectedBreaks[0] = 1;
3978         int32_t breakPos = 0;
3979         expectedCount = 0;
3980         for (;;) {
3981             breakPos = mk.next(breakPos);
3982             if (breakPos == -1) {
3983                 break;
3984             }
3985             if (breakPos > testText.length()) {
3986                 errln("breakPos > testText.length()");
3987             }
3988             expectedBreaks[breakPos] = 1;
3989             U_ASSERT(expectedCount<testText.length());
3990             expected[expectedCount ++] = breakPos;
3991         }
3992
3993         // Find the break positions using forward iteration
3994         memset(forwardBreaks, 0, sizeof(forwardBreaks));
3995         if (useUText) {
3996             UErrorCode status = U_ZERO_ERROR;
3997             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
3998             // testUText = utext_openUnicodeString(testUText, &testText, &status);
3999             bi->setText(testUText, status);
4000             TEST_ASSERT_SUCCESS(status);
4001             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4002                                       //  This UText can be closed immediately, so long as the
4003                                       //  testText string continues to exist.
4004         } else {
4005             bi->setText(testText);
4006         }
4007
4008         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4009             if (i < 0 || i > testText.length()) {
4010                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4011                 break;
4012             }
4013             forwardBreaks[i] = 1;
4014         }
4015
4016         // Find the break positions using reverse iteration
4017         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4018         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4019             if (i < 0 || i > testText.length()) {
4020                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4021                 break;
4022             }
4023             reverseBreaks[i] = 1;
4024         }
4025
4026         // Find the break positions using isBoundary() tests.
4027         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4028         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4029         for (i=0; i<=testText.length(); i++) {
4030             isBoundaryBreaks[i] = bi->isBoundary(i);
4031         }
4032
4033
4034         // Find the break positions using the following() function.
4035         // printf(".");
4036         memset(followingBreaks, 0, sizeof(followingBreaks));
4037         int32_t   lastBreakPos = 0;
4038         followingBreaks[0] = 1;
4039         for (i=0; i<testText.length(); i++) {
4040             breakPos = bi->following(i);
4041             if (breakPos <= i ||
4042                 breakPos < lastBreakPos ||
4043                 breakPos > testText.length() ||
4044                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4045                 UChar32 brkChar = testText.char32At(lastBreakPos);
4046                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4047                 errln("%s break monkey test: "
4048                     "Out of range value returned by BreakIterator::following().\n"
4049                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4050                          name, seed, i, breakPos, lastBreakPos);
4051                 }
4052                 break;
4053             }
4054             followingBreaks[breakPos] = 1;
4055             lastBreakPos = breakPos;
4056         }
4057
4058         // Find the break positions using the preceding() function.
4059         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4060         lastBreakPos = testText.length();
4061         precedingBreaks[testText.length()] = 1;
4062         for (i=testText.length(); i>0; i--) {
4063             breakPos = bi->preceding(i);
4064             if (breakPos >= i ||
4065                 breakPos > lastBreakPos ||
4066                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4067                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4068                 UChar32 brkChar = testText.char32At(breakPos);
4069                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4070                 errln("%s break monkey test: "
4071                     "Out of range value returned by BreakIterator::preceding().\n"
4072                     "index=%d;  prev returned %d; lastBreak=%d" ,
4073                     name,  i, breakPos, lastBreakPos);
4074                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4075                     precedingBreaks[i] = 2;   // Forces an error.
4076                 }
4077                 }
4078             } else {
4079                 if (breakPos >= 0) {
4080                     precedingBreaks[breakPos] = 1;
4081                 }
4082                 lastBreakPos = breakPos;
4083             }
4084         }
4085
4086         // Compare the expected and actual results.
4087         for (i=0; i<=testText.length(); i++) {
4088             const char *errorType = NULL;
4089             if  (forwardBreaks[i] != expectedBreaks[i]) {
4090                 errorType = "next()";
4091             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4092                 errorType = "previous()";
4093             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4094                 errorType = "isBoundary()";
4095             } else if (followingBreaks[i] != expectedBreaks[i]) {
4096                 errorType = "following()";
4097             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4098                 errorType = "preceding()";
4099             }
4100
4101
4102             if (errorType != NULL) {
4103                 // Format a range of the test text that includes the failure as
4104                 //  a data item that can be included in the rbbi test data file.
4105
4106                 // Start of the range is the last point where expected and actual results
4107                 //   both agreed that there was a break position.
4108                 int startContext = i;
4109                 int32_t count = 0;
4110                 for (;;) {
4111                     if (startContext==0) { break; }
4112                     startContext --;
4113                     if (expectedBreaks[startContext] != 0) {
4114                         if (count == 2) break;
4115                         count ++;
4116                     }
4117                 }
4118
4119                 // End of range is two expected breaks past the start position.
4120                 int endContext = i + 1;
4121                 int ci;
4122                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4123                     for (;;) {
4124                         if (endContext >= testText.length()) {break;}
4125                         if (expectedBreaks[endContext-1] != 0) {
4126                             if (count == 0) break;
4127                             count --;
4128                         }
4129                         endContext ++;
4130                     }
4131                 }
4132
4133                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4134                 UnicodeString errorText = "<data>";
4135                 /***if (strcmp(errorType, "next()") == 0) {
4136                     startContext = 0;
4137                     endContext = testText.length();
4138
4139                     printStringBreaks(testText, expected, expectedCount);
4140                 }***/
4141
4142                 for (ci=startContext; ci<endContext;) {
4143                     UnicodeString hexChars("0123456789abcdef");
4144                     UChar32  c;
4145                     int      bn;
4146                     c = testText.char32At(ci);
4147                     if (ci == i) {
4148                         // This is the location of the error.
4149                         errorText.append("<?>");
4150                     } else if (expectedBreaks[ci] != 0) {
4151                         // This a non-error expected break position.
4152                         errorText.append("\\");
4153                     }
4154                     if (c < 0x10000) {
4155                         errorText.append("\\u");
4156                         for (bn=12; bn>=0; bn-=4) {
4157                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4158                         }
4159                     } else {
4160                         errorText.append("\\U");
4161                         for (bn=28; bn>=0; bn-=4) {
4162                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4163                         }
4164                     }
4165                     ci = testText.moveIndex32(ci, 1);
4166                 }
4167                 errorText.append("\\");
4168                 errorText.append("</data>\n");
4169
4170                 // Output the error
4171                 char  charErrorTxt[500];
4172                 UErrorCode status = U_ZERO_ERROR;
4173                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4174                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4175                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4176
4177                 UChar32 brkChar = testText.char32At(i);
4178                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4179                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4180                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4181                     errorType, seed, i, charErrorTxt);
4182                 }
4183                 break;
4184             }
4185         }
4186
4187         loopCount++;
4188     }
4189 #endif
4190 }
4191
4192
4193 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4194 //             This test checks the initial patch,
4195 //             which is to just keep it from crashing.  Correct word boundaries
4196 //             await a proper fix to the dictionary code.
4197 //
4198 void RBBITest::TestBug5532(void)  {
4199    // Text includes a mixture of Thai and Latin.
4200    const unsigned char utf8Data[] = {
4201            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4202            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4203            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4204            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4205            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4206            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4207            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4208            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4209            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4210            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4211            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4212
4213     UErrorCode status = U_ZERO_ERROR;
4214     UText utext=UTEXT_INITIALIZER;
4215     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4216     TEST_ASSERT_SUCCESS(status);
4217
4218     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4219     TEST_ASSERT_SUCCESS(status);
4220     if (U_SUCCESS(status)) {
4221         bi->setText(&utext, status);
4222         TEST_ASSERT_SUCCESS(status);
4223
4224         int32_t breakCount = 0;
4225         int32_t previousBreak = -1;
4226         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4227             // For now, just make sure that the break iterator doesn't hang.
4228             TEST_ASSERT(previousBreak < bi->current());
4229             previousBreak = bi->current();
4230         }
4231         TEST_ASSERT(breakCount > 0);
4232     }
4233     delete bi;
4234     utext_close(&utext);
4235 }
4236
4237
4238 void RBBITest::TestBug9983(void)  {
4239     UnicodeString text = UnicodeString("\\u002A"  // * Other
4240                                        "\\uFF65"  //   Other
4241                                        "\\u309C"  //   Katakana
4242                                        "\\uFF9F"  //   Extend
4243                                        "\\uFF65"  //   Other
4244                                        "\\u0020"  //   Other
4245                                        "\\u0000").unescape();
4246
4247     UErrorCode status = U_ZERO_ERROR;
4248     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4249         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4250     TEST_ASSERT_SUCCESS(status);
4251     if (U_FAILURE(status)) {
4252         return;
4253     }
4254     brkiter->setText(text);
4255     int32_t offset, rstatus;
4256     brkiter->last();
4257     int32_t iterationCount = 0;
4258     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4259         iterationCount++;
4260         rstatus = brkiter->getRuleStatus();
4261         // printf(" %d(%d)", offset, rstatus);
4262         if (iterationCount >= 10) {
4263            break;
4264         }
4265     }
4266     TEST_ASSERT(iterationCount == 6);
4267 }
4268
4269
4270 //
4271 //  TestDebug    -  A place-holder test for debugging purposes.
4272 //                  For putting in fragments of other tests that can be invoked
4273 //                  for tracing  without a lot of unwanted extra stuff happening.
4274 //
4275 void RBBITest::TestDebug(void) {
4276 #if 0
4277     UErrorCode   status = U_ZERO_ERROR;
4278     int pos = 0;
4279     int ruleStatus = 0;
4280
4281     RuleBasedBreakIterator* bi =
4282        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4283        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4284        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4285     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4286     // UnicodeString s("Aaa.  Bcd");
4287     s = s.unescape();
4288     bi->setText(s);
4289     UBool r = bi->isBoundary(8);
4290     printf("%s", r?"true":"false");
4291     return;
4292     pos = bi->last();
4293     do {
4294         // ruleStatus = bi->getRuleStatus();
4295         printf("%d\t%d\n", pos, ruleStatus);
4296         pos = bi->previous();
4297     } while (pos != BreakIterator::DONE);
4298 #endif
4299 }
4300
4301 void RBBITest::TestProperties() {
4302     UErrorCode errorCode = U_ZERO_ERROR;
4303     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4304     if (!prependSet.isEmpty()) {
4305         errln(
4306             "[:GCB=Prepend:] is not empty any more. "
4307             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4308             "change this test to the opposite condition.");
4309     }
4310 }
4311
4312 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */