icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2013, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "utypeinfo.h"  // for 'typeid' to work
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_BREAK_ITERATION
  17
  18 #include "unicode/utypes.h"
  19 #include "unicode/brkiter.h"
  20 #include "unicode/rbbi.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/utf16.h"
  23 #include "unicode/ucnv.h"
  24 #include "unicode/schriter.h"
  25 #include "unicode/uniset.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27 #include "unicode/regex.h"
  28 #endif
  29 #include "unicode/ustring.h"
  30 #include "unicode/utext.h"
  31 #include "intltest.h"
  32 #include "rbbitst.h"
  33 #include <string.h>
  34 #include "uvector.h"
  35 #include "uvectr32.h"
  36 #include <string.h>
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include "unicode/numfmt.h"
  40 #include "unicode/uscript.h"
  41
  42 #define TEST_ASSERT(x) {if (!(x)) { \
  43     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  44
  45 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  46     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  47
  48
  49 //---------------------------------------------
  50 // runIndexedTest
  51 //---------------------------------------------
  52
  53
  54 //  Note:  Before adding new tests to this file, check whether the desired test data can
  55 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
  56 //         it's much less work than writing a new test, diagnostic output in the event of failures
   57 //         is good, and the test data file is shared with ICU4J, so eventually the test
  58 //         will run there as well, without additional effort.
  59
  60 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  61 {
  62     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  63
  64     switch (index) {
  65 #if !UCONFIG_NO_FILE_IO
  66         case 0: name = "TestBug4153072";
  67             if(exec) TestBug4153072();                         break;
  68 #else
  69         case 0: name = "skip";
  70             break;
  71 #endif
  72
  73         case 1: name = "skip";
  74             break;
  75         case 2: name = "TestStatusReturn";
  76             if(exec) TestStatusReturn();                       break;
  77
  78 #if !UCONFIG_NO_FILE_IO
  79         case 3: name = "TestUnicodeFiles";
  80             if(exec) TestUnicodeFiles();                       break;
  81         case 4: name = "TestEmptyString";
  82             if(exec) TestEmptyString();                        break;
  83 #else
  84         case 3: case 4: name = "skip";
  85             break;
  86 #endif
  87
  88         case 5: name = "TestGetAvailableLocales";
  89             if(exec) TestGetAvailableLocales();                break;
  90
  91         case 6: name = "TestGetDisplayName";
  92             if(exec) TestGetDisplayName();                     break;
  93
  94 #if !UCONFIG_NO_FILE_IO
  95         case 7: name = "TestEndBehaviour";
  96             if(exec) TestEndBehaviour();                       break;
  97         case 8: case 9: case 10: name = "skip";
  98              break;
  99         case 11: name = "TestWordBreaks";
 100              if(exec) TestWordBreaks();                        break;
 101         case 12: name = "TestWordBoundary";
 102              if(exec) TestWordBoundary();                      break;
 103         case 13: name = "TestLineBreaks";
 104              if(exec) TestLineBreaks();                        break;
 105         case 14: name = "TestSentBreaks";
 106              if(exec) TestSentBreaks();                        break;
 107         case 15: name = "TestExtended";
 108              if(exec) TestExtended();                          break;
 109 #else
 110         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 111              break;
 112 #endif
 113
 114 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 115         case 16:
 116             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
 117 #else
 118         case 16:
 119              name = "skip";                                    break;
 120 #endif
 121
 122 #if !UCONFIG_NO_FILE_IO
 123         case 17: name = "TestBug3818";
 124             if(exec) TestBug3818();                            break;
 125 #else
 126         case 17: name = "skip";
 127             break;
 128 #endif
 129
 130         case 18: name = "skip";
 131             break;
 132         case 19: name = "TestDebug";
 133             if(exec) TestDebug();                              break;
 134         case 20: name = "skip";
 135             break;
 136
 137 #if !UCONFIG_NO_FILE_IO
 138         case 21: name = "TestBug5775";
 139             if (exec) TestBug5775();                           break;
 140 #else
 141         case 21: name = "skip";
 142             break;
 143 #endif
 144
 145         case 22: name = "TestBug9983";
 146             if (exec) TestBug9983();                           break;
 147         case 23: name = "TestDictRules";
 148             if (exec) TestDictRules();                         break;
 149         case 24: name = "TestBug5532";
 150             if (exec) TestBug5532();                           break;
 151         default: name = ""; break; //needed to end loop
 152     }
 153 }
 154
 155
 156 //---------------------------------------------------------------------------
 157 //
 158 //   class BITestData   Holds a set of Break iterator test data and results
 159 //                      Includes
 160 //                         - the string data to be broken
 161 //                         - a vector of the expected break positions.
 162 //                         - a vector of source line numbers for the data,
  163 //                               (to help see where errors occurred.)
 164 //                         - The expected break tag values.
 165 //                         - Vectors of actual break positions and tag values.
 166 //                         - Functions for comparing actual with expected and
 167 //                            reporting errors.
 168 //
 169 //----------------------------------------------------------------------------
 170 class BITestData {
 171 public:
 172     UnicodeString    fDataToBreak;
 173     UVector          fExpectedBreakPositions;
 174     UVector          fExpectedTags;
 175     UVector          fLineNum;
 176     UVector          fActualBreakPositions;   // Test Results.
 177     UVector          fActualTags;
 178
 179     BITestData(UErrorCode &status);
 180     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 181     void             checkResults(const char *heading, RBBITest *test);
 182     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 183     void             clearResults();
 184 };
 185
 186 //
 187 // Constructor.
 188 //
 189 BITestData::BITestData(UErrorCode &status)
 190 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 191   fActualTags(status)
 192 {
 193 }
 194
 195 //
 196 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 197 //                 The macro form collects the line number, which is helpful
 198 //                 when tracking down failures.
 199 //
 200 //                 A null data item is inserted at the start of each test's data
 201 //                  to put the starting zero into the data list.  The position saved for
 202 //                  each non-null item is its ending position.
 203 //
 204 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 205 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 206     if (U_FAILURE(status)) {return;}
 207     if (data != NULL) {
 208         fDataToBreak.append(CharsToUnicodeString(data));
 209     }
 210     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 211     fExpectedTags.addElement(tag, status);
 212     fLineNum.addElement(lineNum, status);
 213 }
 214
 215
 216 //
 217 //  checkResults.   Compare the actual and expected break positions, report any differences.
 218 //
 219 void BITestData::checkResults(const char *heading, RBBITest *test) {
 220     int32_t   expectedIndex = 0;
 221     int32_t   actualIndex = 0;
 222
 223     for (;;) {
 224         // If we've run through both the expected and actual results vectors, we're done.
 225         //   break out of the loop.
 226         if (expectedIndex >= fExpectedBreakPositions.size() &&
 227             actualIndex   >= fActualBreakPositions.size()) {
 228             break;
 229         }
 230
 231
 232         if (expectedIndex >= fExpectedBreakPositions.size()) {
 233             err(heading, test, expectedIndex-1, actualIndex);
 234             actualIndex++;
 235             continue;
 236         }
 237
 238         if (actualIndex >= fActualBreakPositions.size()) {
 239             err(heading, test, expectedIndex, actualIndex-1);
 240             expectedIndex++;
 241             continue;
 242         }
 243
 244         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 245             err(heading, test, expectedIndex, actualIndex);
 246             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 247             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 248                 actualIndex++;
 249             } else {
 250                 expectedIndex++;
 251             }
 252             continue;
 253         }
 254
 255         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 256             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 257                 heading, fLineNum.elementAt(expectedIndex),
 258                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 259         }
 260
 261         actualIndex++;
 262         expectedIndex++;
 263     }
 264 }
 265
 266 //
 267 //  err   -  An error was found.  Report it, along with information about where the
 268 //                                incorrectly broken test data appeared in the source file.
 269 //
 270 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 271 {
 272     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 273     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 274     int32_t   o        = 0;
 275     int32_t   line     = fLineNum.elementAti(expectedIdx);
 276     if (expectedIdx > 0) {
 277         // The line numbers are off by one because a premature break occurs somewhere
 278         //    within the previous item, rather than at the start of the current (expected) item.
 279         //    We want to report the offset of the unexpected break from the start of
 280         //      this previous item.
 281         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 282     }
 283     if (actual < expected) {
 284         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 285     } else {
 286         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 287     }
 288 }
 289
 290
 291 void BITestData::clearResults() {
 292     fActualBreakPositions.removeAllElements();
 293     fActualTags.removeAllElements();
 294 }
 295
 296
 297 //--------------------------------------------------------------------------------------
 298 //
 299 //    RBBITest    constructor and destructor
 300 //
 301 //--------------------------------------------------------------------------------------
 302
 303 RBBITest::RBBITest() {
 304 }
 305
 306
 307 RBBITest::~RBBITest() {
 308 }
 309
 310 //-----------------------------------------------------------------------------------
 311 //
 312 //   Test for status {tag} return value from break rules.
 313 //        TODO:  a more thorough test.
 314 //
 315 //-----------------------------------------------------------------------------------
 316 void RBBITest::TestStatusReturn() {
 317      UnicodeString rulesString1("$Letters = [:L:];\n"
 318                                   "$Numbers = [:N:];\n"
 319                                   "$Letters+{1};\n"
 320                                   "$Numbers+{2};\n"
 321                                   "Help\\ {4}/me\\!;\n"
 322                                   "[^$Letters $Numbers];\n"
 323                                   "!.*;\n", -1, US_INV);
 324      UnicodeString testString1  = "abc123..abc Help me Help me!";
 325                                 // 01234567890123456789012345678
 326      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 327      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 328
 329      UErrorCode status=U_ZERO_ERROR;
 330      UParseError    parseError;
 331
 332      BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 333      if(U_FAILURE(status)) {
 334          dataerrln("FAIL : in construction - %s", u_errorName(status));
 335      } else {
 336          int32_t  pos;
 337          int32_t  i = 0;
 338          bi->setText(testString1);
 339          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 340              if (pos != bounds1[i]) {
 341                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 342                  break;
 343              }
 344
 345              int tag = bi->getRuleStatus();
 346              if (tag != brkStatus[i]) {
 347                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 348                  break;
 349              }
 350              i++;
 351          }
 352      }
 353      delete bi;
 354 }
 355
 356
 357 static void printStringBreaks(UnicodeString ustr, int expected[],
 358                               int expectedcount)
 359 {
 360     UErrorCode status = U_ZERO_ERROR;
 361     char name[100];
 362     printf("code    alpha extend alphanum type word sent line name\n");
 363     int j;
 364     for (j = 0; j < ustr.length(); j ++) {
 365         if (expectedcount > 0) {
 366             int k;
 367             for (k = 0; k < expectedcount; k ++) {
 368                 if (j == expected[k]) {
 369                     printf("------------------------------------------------ %d\n",
 370                            j);
 371                 }
 372             }
 373         }
 374         UChar32 c = ustr.char32At(j);
 375         if (c > 0xffff) {
 376             j ++;
 377         }
 378         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 379         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 380                            u_isUAlphabetic(c),
 381                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 382                            u_isalnum(c),
 383                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 384                                                   u_charType(c),
 385                                                   U_SHORT_PROPERTY_NAME),
 386                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 387                                                   u_getIntPropertyValue(c,
 388                                                           UCHAR_WORD_BREAK),
 389                                                   U_SHORT_PROPERTY_NAME),
 390                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 391                                    u_getIntPropertyValue(c,
 392                                            UCHAR_SENTENCE_BREAK),
 393                                    U_SHORT_PROPERTY_NAME),
 394                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 395                                    u_getIntPropertyValue(c,
 396                                            UCHAR_LINE_BREAK),
 397                                    U_SHORT_PROPERTY_NAME),
 398                            name);
 399     }
 400 }
 401
 402
 403 void RBBITest::TestBug3818() {
 404     UErrorCode  status = U_ZERO_ERROR;
 405
 406     // Four Thai words...
 407     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 408                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 409     UnicodeString  thaiStr(thaiWordData);
 410
 411     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 412     if (U_FAILURE(status) || bi == NULL) {
 413         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 414         return;
 415     }
 416     bi->setText(thaiStr);
 417
 418     int32_t  startOfSecondWord = bi->following(1);
 419     if (startOfSecondWord != 4) {
 420         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 421             __FILE__, __LINE__, startOfSecondWord);
 422     }
 423     startOfSecondWord = bi->following(0);
 424     if (startOfSecondWord != 4) {
 425         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 426             __FILE__, __LINE__, startOfSecondWord);
 427     }
 428     delete bi;
 429 }
 430
 431 //----------------------------------------------------------------------------
 432 //
 433 // generalIteratorTest      Given a break iterator and a set of test data,
 434 //                          Run the tests and report the results.
 435 //
 436 //----------------------------------------------------------------------------
 437 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 438 {
 439
 440     bi.setText(td.fDataToBreak);
 441
 442     testFirstAndNext(bi, td);
 443
 444     testLastAndPrevious(bi, td);
 445
 446     testFollowing(bi, td);
 447     testPreceding(bi, td);
 448     testIsBoundary(bi, td);
 449     doMultipleSelectionTest(bi, td);
 450 }
 451
 452
 453 //
 454 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 455 //                       kind of loop.
 456 //
 457 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 458 {
 459     UErrorCode  status = U_ZERO_ERROR;
 460     int32_t     p;
 461     int32_t     lastP = -1;
 462     int32_t     tag;
 463
 464     logln("Test first and next");
 465     bi.setText(td.fDataToBreak);
 466     td.clearResults();
 467
 468     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 469         td.fActualBreakPositions.addElement(p, status);  // Save result.
 470         tag = bi.getRuleStatus();
 471         td.fActualTags.addElement(tag, status);
 472         if (p <= lastP) {
 473             // If the iterator is not making forward progress, stop.
 474             //  No need to raise an error here, it'll be detected in the normal check of results.
 475             break;
 476         }
 477         lastP = p;
 478     }
 479     td.checkResults("testFirstAndNext", this);
 480 }
 481
 482
 483 //
 484 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 485 //
 486 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 487 {
 488     UErrorCode  status = U_ZERO_ERROR;
 489     int32_t     p;
 490     int32_t     lastP  = 0x7ffffffe;
 491     int32_t     tag;
 492
 493     logln("Test last and previous");
 494     bi.setText(td.fDataToBreak);
 495     td.clearResults();
 496
 497     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 498         // Save break position.  Insert it at start of vector of results, shoving
 499         //    already-saved results further towards the end.
 500         td.fActualBreakPositions.insertElementAt(p, 0, status);
 501         // bi.previous();   // TODO:  Why does this fix things up????
 502         // bi.next();
 503         tag = bi.getRuleStatus();
 504         td.fActualTags.insertElementAt(tag, 0, status);
 505         if (p >= lastP) {
 506             // If the iterator is not making progress, stop.
 507             //  No need to raise an error here, it'll be detected in the normal check of results.
 508             break;
 509         }
 510         lastP = p;
 511     }
 512     td.checkResults("testLastAndPrevious", this);
 513 }
 514
 515
 516 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 517 {
 518     UErrorCode  status = U_ZERO_ERROR;
 519     int32_t     p;
 520     int32_t     tag;
 521     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 522                                  //   cannot be -1; that is returned for DONE.
 523     int         i;
 524
 525     logln("testFollowing():");
 526     bi.setText(td.fDataToBreak);
 527     td.clearResults();
 528
 529     // Save the starting point, since we won't get that out of following.
 530     p = bi.first();
 531     td.fActualBreakPositions.addElement(p, status);  // Save result.
 532     tag = bi.getRuleStatus();
 533     td.fActualTags.addElement(tag, status);
 534
 535     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 536         p = bi.following(i);
 537         if (p != lastP) {
 538             if (p == RuleBasedBreakIterator::DONE) {
 539                 break;
 540             }
 541             // We've reached a new break position.  Save it.
 542             td.fActualBreakPositions.addElement(p, status);  // Save result.
 543             tag = bi.getRuleStatus();
 544             td.fActualTags.addElement(tag, status);
 545             lastP = p;
 546         }
 547     }
 548     // The loop normally exits by means of the break in the middle.
 549     // Make sure that the index was at the correct position for the break iterator to have
 550     //   returned DONE.
 551     if (i != td.fDataToBreak.length()) {
 552         errln("testFollowing():  iterator returned DONE prematurely.");
 553     }
 554
 555     // Full check of all results.
 556     td.checkResults("testFollowing", this);
 557 }
 558
 559
 560
 561 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 562     UErrorCode  status = U_ZERO_ERROR;
 563     int32_t     p;
 564     int32_t     tag;
 565     int32_t     lastP  = 0x7ffffffe;
 566     int         i;
 567
 568     logln("testPreceding():");
 569     bi.setText(td.fDataToBreak);
 570     td.clearResults();
 571
 572     p = bi.last();
 573     td.fActualBreakPositions.addElement(p, status);
 574     tag = bi.getRuleStatus();
 575     td.fActualTags.addElement(tag, status);
 576
 577     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 578         p = bi.preceding(i);
 579         if (p != lastP) {
 580             if (p == RuleBasedBreakIterator::DONE) {
 581                 break;
 582             }
 583             // We've reached a new break position.  Save it.
 584             td.fActualBreakPositions.insertElementAt(p, 0, status);
 585             lastP = p;
 586             tag = bi.getRuleStatus();
 587             td.fActualTags.insertElementAt(tag, 0, status);
 588         }
 589     }
 590     // The loop normally exits by means of the break in the middle.
 591     // Make sure that the index was at the correct position for the break iterator to have
 592     //   returned DONE.
 593     if (i != 0) {
 594         errln("testPreceding():  iterator returned DONE prematurely.");
 595     }
 596
 597     // Full check of all results.
 598     td.checkResults("testPreceding", this);
 599 }
 600
 601
 602
 603 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 604     UErrorCode  status = U_ZERO_ERROR;
 605     int         i;
 606     int32_t     tag;
 607
 608     logln("testIsBoundary():");
 609     bi.setText(td.fDataToBreak);
 610     td.clearResults();
 611
 612     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 613         if (bi.isBoundary(i)) {
 614             td.fActualBreakPositions.addElement(i, status);  // Save result.
 615             tag = bi.getRuleStatus();
 616             td.fActualTags.addElement(tag, status);
 617         }
 618     }
 619     td.checkResults("testIsBoundary: ", this);
 620 }
 621
 622
 623
 624 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 625 {
 626     iterator.setText(td.fDataToBreak);
 627
 628     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 629     int32_t offset = iterator.first();
 630     int32_t testOffset;
 631     int32_t count = 0;
 632
 633     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 634
 635     if (*testIterator != iterator)
 636         errln("clone() or operator!= failed: two clones compared unequal");
 637
 638     do {
 639         testOffset = testIterator->first();
 640         testOffset = testIterator->next(count);
 641         if (offset != testOffset)
 642             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 643
 644         if (offset != RuleBasedBreakIterator::DONE) {
 645             count++;
 646             offset = iterator.next();
 647
 648             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 649                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 650                 if (count > 10000 || offset == -1) {
 651                     errln("operator== failed too many times. Stopping test.");
 652                     if (offset == -1) {
 653                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 654                     }
 655                     return;
 656                 }
 657             }
 658         }
 659     } while (offset != RuleBasedBreakIterator::DONE);
 660
 661     // now do it backwards...
 662     offset = iterator.last();
 663     count = 0;
 664
 665     do {
 666         testOffset = testIterator->last();
 667         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 668         if (offset != testOffset)
 669             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 670
 671         if (offset != RuleBasedBreakIterator::DONE) {
 672             count--;
 673             offset = iterator.previous();
 674         }
 675     } while (offset != RuleBasedBreakIterator::DONE);
 676
 677     delete testIterator;
 678 }
 679
 680
 681 //---------------------------------------------
 682 //
 683 //     other tests
 684 //
 685 //---------------------------------------------
 686 void RBBITest::TestEmptyString()
 687 {
 688     UnicodeString text = "";
 689     UErrorCode status = U_ZERO_ERROR;
 690
 691     BITestData x(status);
 692     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 693     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 694     if (U_FAILURE(status))
 695     {
 696         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 697         return;
 698     }
 699     generalIteratorTest(*bi, x);
 700     delete bi;
 701 }
 702
 703 void RBBITest::TestGetAvailableLocales()
 704 {
 705     int32_t locCount = 0;
 706     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 707
 708     if (locCount == 0)
 709         dataerrln("getAvailableLocales() returned an empty list!");
 710     // Just make sure that it's returning good memory.
 711     int32_t i;
 712     for (i = 0; i < locCount; ++i) {
 713         logln(locList[i].getName());
 714     }
 715 }
 716
 717 //Testing the BreakIterator::getDisplayName() function
 718 void RBBITest::TestGetDisplayName()
 719 {
 720     UnicodeString   result;
 721
 722     BreakIterator::getDisplayName(Locale::getUS(), result);
 723     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 724         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 725                 + result);
 726
 727     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 728     if (result != "French (France)")
 729         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 730                 + result);
 731 }
 732 /**
 733  * Test End Behaviour
 734  * @bug 4068137
 735  */
 736 void RBBITest::TestEndBehaviour()
 737 {
 738     UErrorCode status = U_ZERO_ERROR;
 739     UnicodeString testString("boo.");
 740     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 741     if (U_FAILURE(status))
 742     {
 743         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 744         return;
 745     }
 746     wb->setText(testString);
 747
 748     if (wb->first() != 0)
 749         errln("Didn't get break at beginning of string.");
 750     if (wb->next() != 3)
 751         errln("Didn't get break before period in \"boo.\"");
 752     if (wb->current() != 4 && wb->next() != 4)
 753         errln("Didn't get break at end of string.");
 754     delete wb;
 755 }
 756 /*
 757  * @bug 4153072
 758  */
 759 void RBBITest::TestBug4153072() {
 760     UErrorCode status = U_ZERO_ERROR;
 761     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 762     if (U_FAILURE(status))
 763     {
 764         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 765         return;
 766     }
 767     UnicodeString str("...Hello, World!...");
 768     int32_t begin = 3;
 769     int32_t end = str.length() - 3;
 770     UBool onBoundary;
 771
 772     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 773     iter->adoptText(textIterator);
 774     int index;
 775     // Note: with the switch to UText, there is no way to restrict the
 776     //       iteration range to begin at an index other than zero.
 777     //       String character iterators created with a non-zero bound are
 778     //         treated by RBBI as being empty.
 779     for (index = -1; index < begin + 1; ++index) {
 780         onBoundary = iter->isBoundary(index);
 781         if (index == 0?  !onBoundary : onBoundary) {
 782             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 783                             " and begin index = " + begin);
 784         }
 785     }
 786     delete iter;
 787 }
 788
 789
 790 //
 791 // Test for problem reported by Ashok Matoria on 9 July 2007
 792 //    One.<kSoftHyphen><kSpace>Two.
 793 //
 794 //    Sentence break at start (0) and then on calling next() it breaks at
 795 //   'T' of "Two". Now, at this point if I do next() and
 796 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 797 //
 798 void RBBITest::TestBug5775() {
 799     UErrorCode status = U_ZERO_ERROR;
 800     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 801     TEST_ASSERT_SUCCESS(status);
 802     if (U_FAILURE(status)) {
 803         return;
 804     }
 805 // Check for status first for better handling of no data errors.
 806     TEST_ASSERT(bi != NULL);
 807     if (bi == NULL) {
 808         return;
 809     }
 810
 811     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 812     //               01234      56789
 813     s = s.unescape();
 814     bi->setText(s);
 815     int pos = bi->next();
 816     TEST_ASSERT(pos == 6);
 817     pos = bi->next();
 818     TEST_ASSERT(pos == 10);
 819     pos = bi->previous();
 820     TEST_ASSERT(pos == 6);
 821     delete bi;
 822 }
 823
 824
 825
 826 //------------------------------------------------------------------------------
 827 //
 828 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 829 //
 830 //------------------------------------------------------------------------------
 831
 832 struct TestParams {
 833     BreakIterator   *bi;
 834     UnicodeString    dataToBreak;
 835     UVector32       *expectedBreaks;
 836     UVector32       *srcLine;
 837     UVector32       *srcCol;
 838 };
 839
 840 void RBBITest::executeTest(TestParams *t) {
 841     int32_t    bp;
 842     int32_t    prevBP;
 843     int32_t    i;
 844
 845     if (t->bi == NULL) {
 846         return;
 847     }
 848
 849     t->bi->setText(t->dataToBreak);
 850     //
 851     //  Run the iterator forward
 852     //
 853     prevBP = -1;
 854     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
 855         if (prevBP ==  bp) {
 856             // Fail for lack of forward progress.
 857             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
 858                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 859             break;
 860         }
 861
 862         // Check that there were we didn't miss an expected break between the last one
 863         //  and this one.
 864         for (i=prevBP+1; i<bp; i++) {
 865             if (t->expectedBreaks->elementAti(i) != 0) {
 866                 int expected[] = {0, i};
 867                 printStringBreaks(t->dataToBreak, expected, 2);
 868                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 869                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 870             }
 871         }
 872
 873         // Check that the break we did find was expected
 874         if (t->expectedBreaks->elementAti(bp) == 0) {
 875             int expected[] = {0, bp};
 876             printStringBreaks(t->dataToBreak, expected, 2);
 877             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 878                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 879         } else {
 880             // The break was expected.
 881             //   Check that the {nnn} tag value is correct.
 882             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
 883             if (expectedTagVal == -1) {
 884                 expectedTagVal = 0;
 885             }
 886             int32_t line = t->srcLine->elementAti(bp);
 887             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
 888             if (rs != expectedTagVal) {
 889                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
 890                       "          Actual, Expected status = %4d, %4d",
 891                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
 892             }
 893         }
 894
 895
 896         prevBP = bp;
 897     }
 898
 899     // Verify that there were no missed expected breaks after the last one found
 900     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
 901         if (t->expectedBreaks->elementAti(i) != 0) {
 902             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 903                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 904         }
 905     }
 906
 907     //
 908     //  Run the iterator backwards, verify that the same breaks are found.
 909     //
 910     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
 911     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
 912         if (prevBP ==  bp) {
 913             // Fail for lack of progress.
 914             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
 915                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 916             break;
 917         }
 918
 919         // Check that there were we didn't miss an expected break between the last one
 920         //  and this one.  (UVector returns zeros for index out of bounds.)
 921         for (i=prevBP-1; i>bp; i--) {
 922             if (t->expectedBreaks->elementAti(i) != 0) {
 923                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 924                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 925             }
 926         }
 927
 928         // Check that the break we did find was expected
 929         if (t->expectedBreaks->elementAti(bp) == 0) {
 930             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
 931                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
 932         } else {
 933             // The break was expected.
 934             //   Check that the {nnn} tag value is correct.
 935             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
 936             if (expectedTagVal == -1) {
 937                 expectedTagVal = 0;
 938             }
 939             int line = t->srcLine->elementAti(bp);
 940             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
 941             if (rs != expectedTagVal) {
 942                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
 943                       "          Actual, Expected status = %4d, %4d",
 944                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
 945             }
 946         }
 947
 948         prevBP = bp;
 949     }
 950
 951     // Verify that there were no missed breaks prior to the last one found
 952     for (i=prevBP-1; i>=0; i--) {
 953         if (t->expectedBreaks->elementAti(i) != 0) {
 954             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
 955                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
 956         }
 957     }
 958
 959     // Check isBoundary()
 960     for (i=0; i<t->expectedBreaks->size(); i++) {
 961         UBool boundaryExpected = (t->expectedBreaks->elementAti(i) != 0);
 962         UBool boundaryFound    = t->bi->isBoundary(i);
 963         if (boundaryExpected != boundaryFound) {
 964             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
 965                   "        Expected, Actual= %s, %s",
 966                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i),
 967                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
 968         }
 969     }
 970
 971     // Check following()
 972     for (i=0; i<t->expectedBreaks->size(); i++) {
 973         int32_t actualBreak = t->bi->following(i);
 974         int32_t expectedBreak = BreakIterator::DONE;
 975         for (int32_t j=i+1; j < t->expectedBreaks->size(); j++) {
 976             if (t->expectedBreaks->elementAti(j) != 0) {
 977                 expectedBreak = j;
 978                 break;
 979             }
 980         }
 981         if (expectedBreak != actualBreak) {
 982             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
 983                   "        Expected, Actual= %d, %d",
 984                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
 985         }
 986     }
 987
 988     // Check preceding()
 989     for (i=t->expectedBreaks->size(); i>=0; i--) {
 990         int32_t actualBreak = t->bi->preceding(i);
 991         int32_t expectedBreak = BreakIterator::DONE;
 992
 993         for (int32_t j=i-1; j >= 0; j--) {
 994             if (t->expectedBreaks->elementAti(j) != 0) {
 995                 expectedBreak = j;
 996                 break;
 997             }
 998         }
 999         if (expectedBreak != actualBreak) {
1000             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1001                   "        Expected, Actual= %d, %d",
1002                   i, t->srcLine->elementAti(i), t->srcCol->elementAti(i), expectedBreak, actualBreak);
1003         }
1004     }
1005 }
1006
1007
1008 void RBBITest::TestExtended() {
1009 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1010     UErrorCode      status  = U_ZERO_ERROR;
1011     Locale          locale("");
1012
1013     UnicodeString       rules;
1014     TestParams          tp;
1015     tp.bi             = NULL;
1016     tp.expectedBreaks = new UVector32(status);
1017     tp.srcLine        = new UVector32(status);
1018     tp.srcCol         = new UVector32(status);
1019
1020     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
1021     if (U_FAILURE(status)) {
1022         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1023     }
1024
1025
1026     //
1027     //  Open and read the test data file.
1028     //
1029     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1030     char testFileName[1000];
1031     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1032         errln("Can't open test data.  Path too long.");
1033         return;
1034     }
1035     strcpy(testFileName, testDataDirectory);
1036     strcat(testFileName, "rbbitst.txt");
1037
1038     int    len;
1039     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1040     if (U_FAILURE(status)) {
1041         return; /* something went wrong, error already output */
1042     }
1043
1044
1045
1046
1047     //
1048     //  Put the test data into a UnicodeString
1049     //
1050     UnicodeString testString(FALSE, testFile, len);
1051
1052     enum EParseState{
1053         PARSE_COMMENT,
1054         PARSE_TAG,
1055         PARSE_DATA,
1056         PARSE_NUM
1057     }
1058     parseState = PARSE_TAG;
1059
1060     EParseState savedState = PARSE_TAG;
1061
1062     static const UChar CH_LF        = 0x0a;
1063     static const UChar CH_CR        = 0x0d;
1064     static const UChar CH_HASH      = 0x23;
1065     /*static const UChar CH_PERIOD    = 0x2e;*/
1066     static const UChar CH_LT        = 0x3c;
1067     static const UChar CH_GT        = 0x3e;
1068     static const UChar CH_BACKSLASH = 0x5c;
1069     static const UChar CH_BULLET    = 0x2022;
1070
1071     int32_t    lineNum  = 1;
1072     int32_t    colStart = 0;
1073     int32_t    column   = 0;
1074     int32_t    charIdx  = 0;
1075
1076     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1077
1078     for (charIdx = 0; charIdx < len; ) {
1079         status = U_ZERO_ERROR;
1080         UChar  c = testString.charAt(charIdx);
1081         charIdx++;
1082         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1083             // treat CRLF as a unit
1084             c = CH_LF;
1085             charIdx++;
1086         }
1087         if (c == CH_LF || c == CH_CR) {
1088             lineNum++;
1089             colStart = charIdx;
1090         }
1091         column = charIdx - colStart + 1;
1092
1093         switch (parseState) {
1094         case PARSE_COMMENT:
1095             if (c == 0x0a || c == 0x0d) {
1096                 parseState = savedState;
1097             }
1098             break;
1099
1100         case PARSE_TAG:
1101             {
1102             if (c == CH_HASH) {
1103                 parseState = PARSE_COMMENT;
1104                 savedState = PARSE_TAG;
1105                 break;
1106             }
1107             if (u_isUWhiteSpace(c)) {
1108                 break;
1109             }
1110             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1111                 delete tp.bi;
1112                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1113                 charIdx += 5;
1114                 break;
1115             }
1116             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1117                 delete tp.bi;
1118                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1119                 charIdx += 5;
1120                 break;
1121             }
1122             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1123                 delete tp.bi;
1124                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1125                 charIdx += 5;
1126                 break;
1127             }
1128             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1129                 delete tp.bi;
1130                 tp.bi = NULL;
1131                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1132                 charIdx += 5;
1133                 break;
1134             }
1135             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1136                 delete tp.bi;
1137                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1138                 charIdx += 6;
1139                 break;
1140             }
1141
1142             // <locale  loc_name>
1143             localeMatcher.reset(testString);
1144             if (localeMatcher.lookingAt(charIdx-1, status)) {
1145                 UnicodeString localeName = localeMatcher.group(1, status);
1146                 char localeName8[100];
1147                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1148                 locale = Locale::createFromName(localeName8);
1149                 charIdx += localeMatcher.group(0, status).length() - 1;
1150                 TEST_ASSERT_SUCCESS(status);
1151                 break;
1152             }
1153             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1154                 parseState = PARSE_DATA;
1155                 charIdx += 5;
1156                 tp.dataToBreak = "";
1157                 tp.expectedBreaks->removeAllElements();
1158                 tp.srcCol ->removeAllElements();
1159                 tp.srcLine->removeAllElements();
1160                 break;
1161             }
1162
1163             errln("line %d: Tag expected in test file.", lineNum);
1164             parseState = PARSE_COMMENT;
1165             savedState = PARSE_DATA;
1166             goto end_test; // Stop the test.
1167             }
1168             break;
1169
1170         case PARSE_DATA:
1171             if (c == CH_BULLET) {
1172                 int32_t  breakIdx = tp.dataToBreak.length();
1173                 tp.expectedBreaks->setSize(breakIdx+1);
1174                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1175                 tp.srcLine->setSize(breakIdx+1);
1176                 tp.srcLine->setElementAt(lineNum, breakIdx);
1177                 tp.srcCol ->setSize(breakIdx+1);
1178                 tp.srcCol ->setElementAt(column, breakIdx);
1179                 break;
1180             }
1181
1182             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1183                 // Add final entry to mappings from break location to source file position.
1184                 //  Need one extra because last break position returned is after the
1185                 //    last char in the data, not at the last char.
1186                 tp.srcLine->addElement(lineNum, status);
1187                 tp.srcCol ->addElement(column, status);
1188
1189                 parseState = PARSE_TAG;
1190                 charIdx += 6;
1191
1192                 // RUN THE TEST!
1193                 executeTest(&tp);
1194                 break;
1195             }
1196
1197             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1198                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1199                 // Get the code point from the name and insert it into the test data.
1200                 //   (Damn, no API takes names in Unicode  !!!
1201                 //    we've got to take it back to char *)
1202                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1203                 int32_t nameLength = nameEndIdx - (charIdx+2);
1204                 char charNameBuf[200];
1205                 UChar32 theChar = -1;
1206                 if (nameEndIdx != -1) {
1207                     UErrorCode status = U_ZERO_ERROR;
1208                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1209                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1210                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1211                     if (U_FAILURE(status)) {
1212                         theChar = -1;
1213                     }
1214                 }
1215                 if (theChar == -1) {
1216                     errln("Error in named character in test file at line %d, col %d",
1217                         lineNum, column);
1218                 } else {
1219                     // Named code point was recognized.  Insert it
1220                     //   into the test data.
1221                     tp.dataToBreak.append(theChar);
1222                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1223                         tp.srcLine->addElement(lineNum, status);
1224                         tp.srcCol ->addElement(column, status);
1225                     }
1226                 }
1227                 if (nameEndIdx > charIdx) {
1228                     charIdx = nameEndIdx+1;
1229
1230                 }
1231                 break;
1232             }
1233
1234
1235
1236
1237             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1238                 charIdx++;
1239                 int32_t  breakIdx = tp.dataToBreak.length();
1240                 tp.expectedBreaks->setSize(breakIdx+1);
1241                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1242                 tp.srcLine->setSize(breakIdx+1);
1243                 tp.srcLine->setElementAt(lineNum, breakIdx);
1244                 tp.srcCol ->setSize(breakIdx+1);
1245                 tp.srcCol ->setElementAt(column, breakIdx);
1246                 break;
1247             }
1248
1249             if (c == CH_LT) {
1250                 tagValue   = 0;
1251                 parseState = PARSE_NUM;
1252                 break;
1253             }
1254
1255             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1256                 parseState = PARSE_COMMENT;
1257                 savedState = PARSE_DATA;
1258                 break;
1259             }
1260
1261             if (c == CH_BACKSLASH) {
1262                 // Check for \ at end of line, a line continuation.
1263                 //     Advance over (discard) the newline
1264                 UChar32 cp = testString.char32At(charIdx);
1265                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1266                     // We have a CR LF
1267                     //  Need an extra increment of the input ptr to move over both of them
1268                     charIdx++;
1269                 }
1270                 if (cp == CH_LF || cp == CH_CR) {
1271                     lineNum++;
1272                     colStart = charIdx;
1273                     charIdx++;
1274                     break;
1275                 }
1276
1277                 // Let unescape handle the back slash.
1278                 cp = testString.unescapeAt(charIdx);
1279                 if (cp != -1) {
1280                     // Escape sequence was recognized.  Insert the char
1281                     //   into the test data.
1282                     tp.dataToBreak.append(cp);
1283                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1284                         tp.srcLine->addElement(lineNum, status);
1285                         tp.srcCol ->addElement(column, status);
1286                     }
1287                     break;
1288                 }
1289
1290
1291                 // Not a recognized backslash escape sequence.
1292                 // Take the next char as a literal.
1293                 //  TODO:  Should this be an error?
1294                 c = testString.charAt(charIdx);
1295                 charIdx = testString.moveIndex32(charIdx, 1);
1296             }
1297
1298             // Normal, non-escaped data char.
1299             tp.dataToBreak.append(c);
1300
1301             // Save the mapping from offset in the data to line/column numbers in
1302             //   the original input file.  Will be used for better error messages only.
1303             //   If there's an expected break before this char, the slot in the mapping
1304             //     vector will already be set for this char; don't overwrite it.
1305             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1306                 tp.srcLine->addElement(lineNum, status);
1307                 tp.srcCol ->addElement(column, status);
1308             }
1309             break;
1310
1311
1312         case PARSE_NUM:
1313             // We are parsing an expected numeric tag value, like <1234>,
1314             //   within a chunk of data.
1315             if (u_isUWhiteSpace(c)) {
1316                 break;
1317             }
1318
1319             if (c == CH_GT) {
1320                 // Finished the number.  Add the info to the expected break data,
1321                 //   and switch parse state back to doing plain data.
1322                 parseState = PARSE_DATA;
1323                 if (tagValue == 0) {
1324                     tagValue = -1;
1325                 }
1326                 int32_t  breakIdx = tp.dataToBreak.length();
1327                 tp.expectedBreaks->setSize(breakIdx+1);
1328                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1329                 tp.srcLine->setSize(breakIdx+1);
1330                 tp.srcLine->setElementAt(lineNum, breakIdx);
1331                 tp.srcCol ->setSize(breakIdx+1);
1332                 tp.srcCol ->setElementAt(column, breakIdx);
1333                 break;
1334             }
1335
1336             if (u_isdigit(c)) {
1337                 tagValue = tagValue*10 + u_charDigitValue(c);
1338                 break;
1339             }
1340
1341             errln("Syntax Error in test file at line %d, col %d",
1342                 lineNum, column);
1343             parseState = PARSE_COMMENT;
1344             goto end_test; // Stop the test
1345             break;
1346         }
1347
1348
1349         if (U_FAILURE(status)) {
1350             dataerrln("ICU Error %s while parsing test file at line %d.",
1351                 u_errorName(status), lineNum);
1352             status = U_ZERO_ERROR;
1353             goto end_test; // Stop the test
1354         }
1355
1356     }
1357
1358 end_test:
1359     delete tp.bi;
1360     delete tp.expectedBreaks;
1361     delete tp.srcLine;
1362     delete tp.srcCol;
1363     delete [] testFile;
1364 #endif
1365 }
1366
1367
1368 //-------------------------------------------------------------------------------
1369 //
1370 //  TestDictRules   create a break iterator from source rules that includes a
1371 //                  dictionary range.   Regression for bug #7130.  Source rules
1372 //                  do not declare a break iterator type (word, line, sentence, etc.
1373 //                  but the dictionary code, without a type, would loop.
1374 //
1375 //-------------------------------------------------------------------------------
1376 void RBBITest::TestDictRules() {
1377     const char *rules =  "$dictionary = [a-z]; \n"
1378                          "!!forward; \n"
1379                          "$dictionary $dictionary; \n"
1380                          "!!reverse; \n"
1381                          "$dictionary $dictionary; \n";
1382     const char *text = "aa";
1383     UErrorCode status = U_ZERO_ERROR;
1384     UParseError parseError;
1385
1386     RuleBasedBreakIterator bi(rules, parseError, status);
1387     if (U_SUCCESS(status)) {
1388         UnicodeString utext = text;
1389         bi.setText(utext);
1390         int32_t position;
1391         int32_t loops;
1392         for (loops = 0; loops<10; loops++) {
1393             position = bi.next();
1394             if (position == RuleBasedBreakIterator::DONE) {
1395                 break;
1396             }
1397         }
1398         TEST_ASSERT(loops == 1);
1399     } else {
1400         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1401     }
1402 }
1403
1404
1405
1406 //-------------------------------------------------------------------------------
1407 //
1408 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1409 //    return the datain one big UChar * buffer, which the caller must delete.
1410 //
1411 //    parameters:
1412 //          fileName:   the name of the file, with no directory part.  The test data directory
1413 //                      is assumed.
1414 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1415 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1416 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1417 //                      Pass NULL for the system default encoding.
1418 //          status
1419 //    returns:
1420 //                      The file data, converted to UChar.
1421 //                      The caller must delete this when done with
1422 //                           delete [] theBuffer;
1423 //
1424 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1425 //           Move this function to some common place.
1426 //
1427 //--------------------------------------------------------------------------------
1428 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1429     UChar       *retPtr  = NULL;
1430     char        *fileBuf = NULL;
1431     UConverter* conv     = NULL;
1432     FILE        *f       = NULL;
1433
1434     ulen = 0;
1435     if (U_FAILURE(status)) {
1436         return retPtr;
1437     }
1438
1439     //
1440     //  Open the file.
1441     //
1442     f = fopen(fileName, "rb");
1443     if (f == 0) {
1444         dataerrln("Error opening test data file %s\n", fileName);
1445         status = U_FILE_ACCESS_ERROR;
1446         return NULL;
1447     }
1448     //
1449     //  Read it in
1450     //
1451     int   fileSize;
1452     int   amt_read;
1453
1454     fseek( f, 0, SEEK_END);
1455     fileSize = ftell(f);
1456     fileBuf = new char[fileSize];
1457     fseek(f, 0, SEEK_SET);
1458     amt_read = fread(fileBuf, 1, fileSize, f);
1459     if (amt_read != fileSize || fileSize <= 0) {
1460         errln("Error reading test data file.");
1461         goto cleanUpAndReturn;
1462     }
1463
1464     //
1465     // Look for a Unicode Signature (BOM) on the data just read
1466     //
1467     int32_t        signatureLength;
1468     const char *   fileBufC;
1469     const char*    bomEncoding;
1470
1471     fileBufC = fileBuf;
1472     bomEncoding = ucnv_detectUnicodeSignature(
1473         fileBuf, fileSize, &signatureLength, &status);
1474     if(bomEncoding!=NULL ){
1475         fileBufC  += signatureLength;
1476         fileSize  -= signatureLength;
1477         encoding = bomEncoding;
1478     }
1479
1480     //
1481     // Open a converter to take the rule file to UTF-16
1482     //
1483     conv = ucnv_open(encoding, &status);
1484     if (U_FAILURE(status)) {
1485         goto cleanUpAndReturn;
1486     }
1487
1488     //
1489     // Convert the rules to UChar.
1490     //  Preflight first to determine required buffer size.
1491     //
1492     ulen = ucnv_toUChars(conv,
1493         NULL,           //  dest,
1494         0,              //  destCapacity,
1495         fileBufC,
1496         fileSize,
1497         &status);
1498     if (status == U_BUFFER_OVERFLOW_ERROR) {
1499         // Buffer Overflow is expected from the preflight operation.
1500         status = U_ZERO_ERROR;
1501
1502         retPtr = new UChar[ulen+1];
1503         ucnv_toUChars(conv,
1504             retPtr,       //  dest,
1505             ulen+1,
1506             fileBufC,
1507             fileSize,
1508             &status);
1509     }
1510
1511 cleanUpAndReturn:
1512     fclose(f);
1513     delete []fileBuf;
1514     ucnv_close(conv);
1515     if (U_FAILURE(status)) {
1516         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1517         delete []retPtr;
1518         retPtr = 0;
1519         ulen   = 0;
1520     };
1521     return retPtr;
1522 }
1523
1524
1525
1526 //--------------------------------------------------------------------------------------------
1527 //
1528 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1529 //
1530 //-------------------------------------------------------------------------------------------
1531 void RBBITest::TestUnicodeFiles() {
1532     RuleBasedBreakIterator  *bi;
1533     UErrorCode               status = U_ZERO_ERROR;
1534
1535     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1536     TEST_ASSERT_SUCCESS(status);
1537     if (U_SUCCESS(status)) {
1538         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1539     }
1540     delete bi;
1541
1542     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1543     TEST_ASSERT_SUCCESS(status);
1544     if (U_SUCCESS(status)) {
1545         runUnicodeTestData("WordBreakTest.txt", bi);
1546     }
1547     delete bi;
1548
1549     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1550     TEST_ASSERT_SUCCESS(status);
1551     if (U_SUCCESS(status)) {
1552         runUnicodeTestData("SentenceBreakTest.txt", bi);
1553     }
1554     delete bi;
1555
1556     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1557     TEST_ASSERT_SUCCESS(status);
1558     if (U_SUCCESS(status)) {
1559         runUnicodeTestData("LineBreakTest.txt", bi);
1560     }
1561     delete bi;
1562 }
1563
1564
1565 //--------------------------------------------------------------------------------------------
1566 //
1567 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1568 //
1569 //-------------------------------------------------------------------------------------------
1570 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1571 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1572     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1573     UBool isTicket7270Fixed = !logKnownIssue("7270");
1574     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1575     UErrorCode  status = U_ZERO_ERROR;
1576
1577     //
1578     //  Open and read the test data file, put it into a UnicodeString.
1579     //
1580     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1581     char testFileName[1000];
1582     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1583         dataerrln("Can't open test data.  Path too long.");
1584         return;
1585     }
1586     strcpy(testFileName, testDataDirectory);
1587     strcat(testFileName, fileName);
1588
1589     logln("Opening data file %s\n", fileName);
1590
1591     int    len;
1592     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1593     if (status != U_FILE_ACCESS_ERROR) {
1594         TEST_ASSERT_SUCCESS(status);
1595         TEST_ASSERT(testFile != NULL);
1596     }
1597     if (U_FAILURE(status) || testFile == NULL) {
1598         return; /* something went wrong, error already output */
1599     }
1600     UnicodeString testFileAsString(TRUE, testFile, len);
1601
1602     //
1603     //  Parse the test data file using a regular expression.
1604     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1605     //     is identified by which group had a match.
1606     //
1607     //    Caputure Group #                  1          2            3            4           5
1608     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1609     //
1610     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1611     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1612     UnicodeString   testString;
1613     UVector32       breakPositions(status);
1614     int             lineNumber = 1;
1615     TEST_ASSERT_SUCCESS(status);
1616     if (U_FAILURE(status)) {
1617         return;
1618     }
1619
1620     //
1621     //  Scan through each test case, building up the string to be broken in testString,
1622     //   and the positions that should be boundaries in the breakPositions vector.
1623     //
1624     int spin = 0;
1625     while (tokenMatcher.find()) {
1626         if(tokenMatcher.hitEnd()) {
1627           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1628              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1629              and caused an infinite loop here on EBCDIC systems!
1630           */
1631           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1632           //       return;
1633         }
1634         if (tokenMatcher.start(1, status) >= 0) {
1635             // Scanned a divide sign, indicating a break position in the test data.
1636             if (testString.length()>0) {
1637                 breakPositions.addElement(testString.length(), status);
1638             }
1639         }
1640         else if (tokenMatcher.start(2, status) >= 0) {
1641             // Scanned an 'x', meaning no break at this position in the test data
1642             //   Nothing to be done here.
1643             }
1644         else if (tokenMatcher.start(3, status) >= 0) {
1645             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1646             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1647             int length = hexNumber.length();
1648             if (length<=8) {
1649                 char buf[10];
1650                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1651                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1652                 if (c<=0x10ffff) {
1653                     testString.append(c);
1654                 } else {
1655                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1656                        fileName, lineNumber);
1657                 }
1658             } else {
1659                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1660                        fileName, lineNumber);
1661              }
1662         }
1663         else if (tokenMatcher.start(4, status) >= 0) {
1664             // Scanned to end of a line, possibly skipping over a comment in the process.
1665             //   If the line from the file contained test data, run the test now.
1666             //
1667             if (testString.length() > 0) {
1668 // TODO(andy): Remove this time bomb code. Note: Failing line numbers may change when updating to new Unicode data.
1669 //             Rule 8
1670 //                ZW SP* <break>
1671 //             is not yet implemented.
1672 if (!(isLineBreak && !isTicket7270Fixed && (5198 == lineNumber ||
1673                                             5202 == lineNumber ||
1674                                             5214 == lineNumber ||
1675                                             5246 == lineNumber ||
1676                                             5298 == lineNumber ||
1677                                             5302 == lineNumber ))) {
1678                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1679 }
1680             }
1681
1682             // Clear out this test case.
1683             //    The string and breakPositions vector will be refilled as the next
1684             //       test case is parsed.
1685             testString.remove();
1686             breakPositions.removeAllElements();
1687             lineNumber++;
1688         } else {
1689             // Scanner catchall.  Something unrecognized appeared on the line.
1690             char token[16];
1691             UnicodeString uToken = tokenMatcher.group(0, status);
1692             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1693             token[sizeof(token)-1] = 0;
1694             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1695
1696             // Clean up, in preparation for continuing with the next line.
1697             testString.remove();
1698             breakPositions.removeAllElements();
1699             lineNumber++;
1700         }
1701         TEST_ASSERT_SUCCESS(status);
1702         if (U_FAILURE(status)) {
1703             break;
1704         }
1705     }
1706
1707     delete [] testFile;
1708  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1709 }
1710
1711 //--------------------------------------------------------------------------------------------
1712 //
1713 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1714 //                            test data files.  Do only a simple, forward-only check -
1715 //                            this test is mostly to check that ICU and the Unicode
1716 //                            data agree with each other.
1717 //
1718 //--------------------------------------------------------------------------------------------
1719 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1720                          const UnicodeString &testString,   // Text data to be broken
1721                          UVector32 *breakPositions,         // Positions where breaks should be found.
1722                          RuleBasedBreakIterator *bi) {
1723     int32_t pos;                 // Break Position in the test string
1724     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1725     int32_t expectedPos;         // Expected break position (index into test string)
1726
1727     bi->setText(testString);
1728     pos = bi->first();
1729     pos = bi->next();
1730
1731     while (pos != BreakIterator::DONE) {
1732         if (expectedI >= breakPositions->size()) {
1733             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1734                 testFileName, lineNumber, pos);
1735             break;
1736         }
1737         expectedPos = breakPositions->elementAti(expectedI);
1738         if (pos < expectedPos) {
1739             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1740                 testFileName, lineNumber, pos);
1741             break;
1742         }
1743         if (pos > expectedPos) {
1744             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1745                 testFileName, lineNumber, expectedPos);
1746             break;
1747         }
1748         pos = bi->next();
1749         expectedI++;
1750     }
1751
1752     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1753         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1754             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1755     }
1756 }
1757
1758
1759
1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761 //---------------------------------------------------------------------------------------
1762 //
1763 //   classs RBBIMonkeyKind
1764 //
1765 //      Monkey Test for Break Iteration
1766 //      Abstract interface class.   Concrete derived classes independently
1767 //      implement the break rules for different iterator types.
1768 //
1769 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1770 //      testing, but works purely in terms of the interface defined here.
1771 //
1772 //---------------------------------------------------------------------------------------
1773 class RBBIMonkeyKind {
1774 public:
1775     // Return a UVector of UnicodeSets, representing the character classes used
1776     //   for this type of iterator.
1777     virtual  UVector  *charClasses() = 0;
1778
1779     // Set the test text on which subsequent calls to next() will operate
1780     virtual  void      setText(const UnicodeString &s) = 0;
1781
1782     // Find the next break postion, starting from the prev break position, or from zero.
1783     // Return -1 after reaching end of string.
1784     virtual  int32_t   next(int32_t i) = 0;
1785
1786     virtual ~RBBIMonkeyKind();
1787     UErrorCode       deferredStatus;
1788
1789
1790 protected:
1791     RBBIMonkeyKind();
1792
1793 private:
1794 };
1795
1796 RBBIMonkeyKind::RBBIMonkeyKind() {
1797     deferredStatus = U_ZERO_ERROR;
1798 }
1799
1800 RBBIMonkeyKind::~RBBIMonkeyKind() {
1801 }
1802
1803
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Modeled on the standard library rand() and srand().
//                    The library versions are avoided in order to
//                      1.  Produce the same sequence on every platform.
//                      2.  Expose the current seed, making failures easier to reproduce.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator by one step and return a value in the range 0 .. 32767.
static uint32_t m_rand()
{
    // Linear congruential step; the unsigned arithmetic wraps modulo 2^32.
    m_seed = m_seed * 1103515245 + 12345;

    // Bits 16 .. 30 of the new seed form the result.
    return (m_seed >> 16) & 0x7FFF;
}
1819
1820
1821 //------------------------------------------------------------------------------------------
1822 //
1823 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1824 //                             of RBBIMonkeyKind.
1825 //
1826 //------------------------------------------------------------------------------------------
1827 class RBBICharMonkey: public RBBIMonkeyKind {
1828 public:
1829     RBBICharMonkey();
1830     virtual          ~RBBICharMonkey();
1831     virtual  UVector *charClasses();
1832     virtual  void     setText(const UnicodeString &s);
1833     virtual  int32_t  next(int32_t i);
1834 private:
1835     UVector   *fSets;
1836
1837     UnicodeSet  *fCRLFSet;
1838     UnicodeSet  *fControlSet;
1839     UnicodeSet  *fExtendSet;
1840     UnicodeSet  *fRegionalIndicatorSet;
1841     UnicodeSet  *fPrependSet;
1842     UnicodeSet  *fSpacingSet;
1843     UnicodeSet  *fLSet;
1844     UnicodeSet  *fVSet;
1845     UnicodeSet  *fTSet;
1846     UnicodeSet  *fLVSet;
1847     UnicodeSet  *fLVTSet;
1848     UnicodeSet  *fHangulSet;
1849     UnicodeSet  *fAnySet;
1850
1851     const UnicodeString *fText;
1852 };
1853
1854
1855 RBBICharMonkey::RBBICharMonkey() {
1856     UErrorCode  status = U_ZERO_ERROR;
1857
1858     fText = NULL;
1859
1860     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
1861     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
1862     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
1863     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
1864     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
1865     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
1866     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
1867     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
1868     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
1869     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
1870     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
1871     fHangulSet  = new UnicodeSet();
1872     fHangulSet->addAll(*fLSet);
1873     fHangulSet->addAll(*fVSet);
1874     fHangulSet->addAll(*fTSet);
1875     fHangulSet->addAll(*fLVSet);
1876     fHangulSet->addAll(*fLVTSet);
1877     fAnySet     = new UnicodeSet(0, 0x10ffff);
1878
1879     fSets       = new UVector(status);
1880     fSets->addElement(fCRLFSet,    status);
1881     fSets->addElement(fControlSet, status);
1882     fSets->addElement(fExtendSet,  status);
1883     fSets->addElement(fRegionalIndicatorSet, status);
1884     if (!fPrependSet->isEmpty()) {
1885         fSets->addElement(fPrependSet, status);
1886     }
1887     fSets->addElement(fSpacingSet, status);
1888     fSets->addElement(fHangulSet,  status);
1889     fSets->addElement(fAnySet,     status);
1890     if (U_FAILURE(status)) {
1891         deferredStatus = status;
1892     }
1893 }
1894
1895
1896 void RBBICharMonkey::setText(const UnicodeString &s) {
1897     fText = &s;
1898 }
1899
1900
1901
1902 int32_t RBBICharMonkey::next(int32_t prevPos) {
1903     int    p0, p1, p2, p3;    // Indices of the significant code points around the
1904                               //   break position being tested.  The candidate break
1905                               //   location is before p2.
1906
1907     int     breakPos = -1;
1908
1909     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
1910
1911     if (U_FAILURE(deferredStatus)) {
1912         return -1;
1913     }
1914
1915     // Previous break at end of string.  return DONE.
1916     if (prevPos >= fText->length()) {
1917         return -1;
1918     }
1919     p0 = p1 = p2 = p3 = prevPos;
1920     c3 =  fText->char32At(prevPos);
1921     c0 = c1 = c2 = 0;
1922     (void)p0;   // suppress set but not used warning.
1923     (void)c0;
1924
1925     // Loop runs once per "significant" character position in the input text.
1926     for (;;) {
1927         // Move all of the positions forward in the input string.
1928         p0 = p1;  c0 = c1;
1929         p1 = p2;  c1 = c2;
1930         p2 = p3;  c2 = c3;
1931
1932         // Advancd p3 by one codepoint
1933         p3 = fText->moveIndex32(p3, 1);
1934         c3 = fText->char32At(p3);
1935
1936         if (p1 == p2) {
1937             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1938             continue;
1939         }
1940         if (p2 == fText->length()) {
1941             // Reached end of string.  Always a break position.
1942             break;
1943         }
1944
1945         // Rule  GB3   CR x LF
1946         //     No Extend or Format characters may appear between the CR and LF,
1947         //     which requires the additional check for p2 immediately following p1.
1948         //
1949         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
1950             continue;
1951         }
1952
1953         // Rule (GB4).   ( Control | CR | LF ) <break>
1954         if (fControlSet->contains(c1) ||
1955             c1 == 0x0D ||
1956             c1 == 0x0A)  {
1957             break;
1958         }
1959
1960         // Rule (GB5)    <break>  ( Control | CR | LF )
1961         //
1962         if (fControlSet->contains(c2) ||
1963             c2 == 0x0D ||
1964             c2 == 0x0A)  {
1965             break;
1966         }
1967
1968
1969         // Rule (GB6)  L x ( L | V | LV | LVT )
1970         if (fLSet->contains(c1) &&
1971                (fLSet->contains(c2)  ||
1972                 fVSet->contains(c2)  ||
1973                 fLVSet->contains(c2) ||
1974                 fLVTSet->contains(c2))) {
1975             continue;
1976         }
1977
1978         // Rule (GB7)    ( LV | V )  x  ( V | T )
1979         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
1980             (fVSet->contains(c2) || fTSet->contains(c2)))  {
1981             continue;
1982         }
1983
1984         // Rule (GB8)    ( LVT | T)  x T
1985         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
1986             fTSet->contains(c2))  {
1987             continue;
1988         }
1989
1990         // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
1991
1992         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
1993         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
1994             continue;
1995         }
1996
1997         // Rule (GB9)    Numeric x ALetter
1998         if (fExtendSet->contains(c2))  {
1999             continue;
2000         }
2001
2002         // Rule (GB9a)   x  SpacingMark
2003         if (fSpacingSet->contains(c2)) {
2004             continue;
2005         }
2006
2007         // Rule (GB9b)   Prepend x
2008         if (fPrependSet->contains(c1)) {
2009             continue;
2010         }
2011
2012         // Rule (GB10)  Any  <break>  Any
2013         break;
2014     }
2015
2016     breakPos = p2;
2017     return breakPos;
2018 }
2019
2020
2021
2022 UVector  *RBBICharMonkey::charClasses() {
2023     return fSets;
2024 }
2025
2026
2027 RBBICharMonkey::~RBBICharMonkey() {
2028     delete fSets;
2029     delete fCRLFSet;
2030     delete fControlSet;
2031     delete fExtendSet;
2032     delete fRegionalIndicatorSet;
2033     delete fPrependSet;
2034     delete fSpacingSet;
2035     delete fLSet;
2036     delete fVSet;
2037     delete fTSet;
2038     delete fLVSet;
2039     delete fLVTSet;
2040     delete fHangulSet;
2041     delete fAnySet;
2042 }
2043
2044 //------------------------------------------------------------------------------------------
2045 //
2046 //   class RBBIWordMonkey      Word Break specific implementation
2047 //                             of RBBIMonkeyKind.
2048 //
2049 //------------------------------------------------------------------------------------------
2050 class RBBIWordMonkey: public RBBIMonkeyKind {
2051 public:
2052     RBBIWordMonkey();
2053     virtual          ~RBBIWordMonkey();
2054     virtual  UVector *charClasses();
2055     virtual  void     setText(const UnicodeString &s);
2056     virtual int32_t   next(int32_t i);
2057 private:
2058     UVector      *fSets;
2059
2060     UnicodeSet  *fCRSet;
2061     UnicodeSet  *fLFSet;
2062     UnicodeSet  *fNewlineSet;
2063     UnicodeSet  *fRegionalIndicatorSet;
2064     UnicodeSet  *fKatakanaSet;
2065     UnicodeSet  *fHebrew_LetterSet;
2066     UnicodeSet  *fALetterSet;
2067     // TODO(jungshik): Do we still need this change?
2068     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2069     UnicodeSet  *fSingle_QuoteSet;
2070     UnicodeSet  *fDouble_QuoteSet;
2071     UnicodeSet  *fMidNumLetSet;
2072     UnicodeSet  *fMidLetterSet;
2073     UnicodeSet  *fMidNumSet;
2074     UnicodeSet  *fNumericSet;
2075     UnicodeSet  *fFormatSet;
2076     UnicodeSet  *fOtherSet;
2077     UnicodeSet  *fExtendSet;
2078     UnicodeSet  *fExtendNumLetSet;
2079     UnicodeSet  *fDictionaryCjkSet;
2080
2081     const UnicodeString  *fText;
2082 };
2083
2084
2085 RBBIWordMonkey::RBBIWordMonkey()
2086 {
2087     UErrorCode  status = U_ZERO_ERROR;
2088
2089     fSets            = new UVector(status);
2090
2091     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2092     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2093     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2094     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2095     // Exclude Hangul syllables from ALetterSet during testing.
2096     // Leave CJK dictionary characters out from the monkey tests!
2097 #if 0
2098     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2099                                       "[\\p{Line_Break = Complex_Context}"
2100                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2101                                       "-\\p{Grapheme_Cluster_Break = Control}"
2102                                       "]]",
2103                                       status);
2104 #endif
2105     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2106     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2107     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2108     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2109     fALetterSet->removeAll(*fDictionaryCjkSet);
2110     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2111     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2112     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2113     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2114     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2115     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2116     // we should figure out why
2117     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2118     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2119     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2120     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2121
2122     fOtherSet        = new UnicodeSet();
2123     if(U_FAILURE(status)) {
2124       deferredStatus = status;
2125       return;
2126     }
2127
2128     fOtherSet->complement();
2129     fOtherSet->removeAll(*fCRSet);
2130     fOtherSet->removeAll(*fLFSet);
2131     fOtherSet->removeAll(*fNewlineSet);
2132     fOtherSet->removeAll(*fKatakanaSet);
2133     fOtherSet->removeAll(*fHebrew_LetterSet);
2134     fOtherSet->removeAll(*fALetterSet);
2135     fOtherSet->removeAll(*fSingle_QuoteSet);
2136     fOtherSet->removeAll(*fDouble_QuoteSet);
2137     fOtherSet->removeAll(*fMidLetterSet);
2138     fOtherSet->removeAll(*fMidNumSet);
2139     fOtherSet->removeAll(*fNumericSet);
2140     fOtherSet->removeAll(*fExtendNumLetSet);
2141     fOtherSet->removeAll(*fFormatSet);
2142     fOtherSet->removeAll(*fExtendSet);
2143     fOtherSet->removeAll(*fRegionalIndicatorSet);
2144     // Inhibit dictionary characters from being tested at all.
2145     fOtherSet->removeAll(*fDictionaryCjkSet);
2146     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2147
2148     fSets->addElement(fCRSet,                status);
2149     fSets->addElement(fLFSet,                status);
2150     fSets->addElement(fNewlineSet,           status);
2151     fSets->addElement(fRegionalIndicatorSet, status);
2152     fSets->addElement(fHebrew_LetterSet,     status);
2153     fSets->addElement(fALetterSet,           status);
2154     fSets->addElement(fSingle_QuoteSet,      status);
2155     fSets->addElement(fDouble_QuoteSet,      status);
2156     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2157     fSets->addElement(fMidLetterSet,         status);
2158     fSets->addElement(fMidNumLetSet,         status);
2159     fSets->addElement(fMidNumSet,            status);
2160     fSets->addElement(fNumericSet,           status);
2161     fSets->addElement(fFormatSet,            status);
2162     fSets->addElement(fExtendSet,            status);
2163     fSets->addElement(fOtherSet,             status);
2164     fSets->addElement(fExtendNumLetSet,      status);
2165
2166     if (U_FAILURE(status)) {
2167         deferredStatus = status;
2168     }
2169 }
2170
2171 void RBBIWordMonkey::setText(const UnicodeString &s) {
2172     fText       = &s;
2173 }
2174
2175
2176 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2177     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2178                               //   break position being tested.  The candidate break
2179                               //   location is before p2.
2180
2181     int     breakPos = -1;
2182
2183     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2184
2185     if (U_FAILURE(deferredStatus)) {
2186         return -1;
2187     }
2188
2189     // Prev break at end of string.  return DONE.
2190     if (prevPos >= fText->length()) {
2191         return -1;
2192     }
2193     p0 = p1 = p2 = p3 = prevPos;
2194     c3 =  fText->char32At(prevPos);
2195     c0 = c1 = c2 = 0;
2196     (void)p0;       // Suppress set but not used warning.
2197
2198     // Loop runs once per "significant" character position in the input text.
2199     for (;;) {
2200         // Move all of the positions forward in the input string.
2201         p0 = p1;  c0 = c1;
2202         p1 = p2;  c1 = c2;
2203         p2 = p3;  c2 = c3;
2204
2205         // Advancd p3 by    X(Extend | Format)*   Rule 4
2206         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2207         do {
2208             p3 = fText->moveIndex32(p3, 1);
2209             c3 = fText->char32At(p3);
2210             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2211                break;
2212             };
2213         }
2214         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2215
2216
2217         if (p1 == p2) {
2218             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2219             continue;
2220         }
2221         if (p2 == fText->length()) {
2222             // Reached end of string.  Always a break position.
2223             break;
2224         }
2225
2226         // Rule  (3)   CR x LF
2227         //     No Extend or Format characters may appear between the CR and LF,
2228         //     which requires the additional check for p2 immediately following p1.
2229         //
2230         if (c1==0x0D && c2==0x0A) {
2231             continue;
2232         }
2233
2234         // Rule (3a)  Break before and after newlines (including CR and LF)
2235         //
2236         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2237             break;
2238         };
2239         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2240             break;
2241         };
2242
2243         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2244         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2245             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2246             continue;
2247         }
2248
2249         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2250         //
2251         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2252              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2253              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2254             continue;
2255         }
2256
2257         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2258         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2259             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2260             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2261             continue;
2262         }
2263
2264         // Rule (7a)     Hebrew_Letter x Single_Quote
2265         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2266             continue;
2267         }
2268
2269         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2270         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2271             continue;
2272         }
2273
2274         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2275         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2276             continue;
2277         }
2278
2279         // Rule (8)    Numeric x Numeric
2280         if (fNumericSet->contains(c1) &&
2281             fNumericSet->contains(c2))  {
2282             continue;
2283         }
2284
2285         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2286         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2287             fNumericSet->contains(c2))  {
2288             continue;
2289         }
2290
2291         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2292         if (fNumericSet->contains(c1) &&
2293             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2294             continue;
2295         }
2296
2297         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2298         if (fNumericSet->contains(c0) &&
2299             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2300             fNumericSet->contains(c2)) {
2301             continue;
2302         }
2303
2304         // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
2305         if (fNumericSet->contains(c1) &&
2306             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2307             fNumericSet->contains(c3)) {
2308             continue;
2309         }
2310
2311         // Rule (13)  Katakana x Katakana
2312         if (fKatakanaSet->contains(c1) &&
2313             fKatakanaSet->contains(c2))  {
2314             continue;
2315         }
2316
2317         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
2318         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2319              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2320              fExtendNumLetSet->contains(c2)) {
2321                 continue;
2322         }
2323
2324         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2325         if (fExtendNumLetSet->contains(c1) &&
2326                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2327                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2328             continue;
2329         }
2330
2331         // Rule 13c
2332         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2333             continue;
2334         }
2335
2336         // Rule 14.  Break found here.
2337         break;
2338     }
2339
2340     breakPos = p2;
2341     return breakPos;
2342 }
2343
2344
2345 UVector  *RBBIWordMonkey::charClasses() {
2346     return fSets;
2347 }
2348
2349
2350 RBBIWordMonkey::~RBBIWordMonkey() {
2351     delete fSets;
2352     delete fCRSet;
2353     delete fLFSet;
2354     delete fNewlineSet;
2355     delete fKatakanaSet;
2356     delete fHebrew_LetterSet;
2357     delete fALetterSet;
2358     delete fSingle_QuoteSet;
2359     delete fDouble_QuoteSet;
2360     delete fMidNumLetSet;
2361     delete fMidLetterSet;
2362     delete fMidNumSet;
2363     delete fNumericSet;
2364     delete fFormatSet;
2365     delete fExtendSet;
2366     delete fExtendNumLetSet;
2367     delete fRegionalIndicatorSet;
2368     delete fDictionaryCjkSet;
2369     delete fOtherSet;
2370 }
2371
2372
2373
2374
2375 //------------------------------------------------------------------------------------------
2376 //
2377 //   class RBBISentMonkey      Sentence Break specific implementation
2378 //                             of RBBIMonkeyKind.
2379 //
2380 //------------------------------------------------------------------------------------------
2381 class RBBISentMonkey: public RBBIMonkeyKind {
2382 public:
2383     RBBISentMonkey();
2384     virtual          ~RBBISentMonkey();
2385     virtual  UVector *charClasses();
2386     virtual  void     setText(const UnicodeString &s);
2387     virtual int32_t   next(int32_t i);
2388 private:
2389     int               moveBack(int posFrom);
2390     int               moveForward(int posFrom);
2391     UChar32           cAt(int pos);
2392
2393     UVector      *fSets;
2394
2395     UnicodeSet  *fSepSet;
2396     UnicodeSet  *fFormatSet;
2397     UnicodeSet  *fSpSet;
2398     UnicodeSet  *fLowerSet;
2399     UnicodeSet  *fUpperSet;
2400     UnicodeSet  *fOLetterSet;
2401     UnicodeSet  *fNumericSet;
2402     UnicodeSet  *fATermSet;
2403     UnicodeSet  *fSContinueSet;
2404     UnicodeSet  *fSTermSet;
2405     UnicodeSet  *fCloseSet;
2406     UnicodeSet  *fOtherSet;
2407     UnicodeSet  *fExtendSet;
2408
2409     const UnicodeString  *fText;
2410
2411 };
2412
2413 RBBISentMonkey::RBBISentMonkey()
2414 {
2415     UErrorCode  status = U_ZERO_ERROR;
2416
2417     fSets            = new UVector(status);
2418
2419     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2420     //                       set and made into character classes of their own.  For the monkey impl,
2421     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2422     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2423     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2424     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2425     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2426     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2427     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2428     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2429     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2430     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2431     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2432     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2433     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2434     fOtherSet        = new UnicodeSet();
2435
2436     if(U_FAILURE(status)) {
2437       deferredStatus = status;
2438       return;
2439     }
2440
2441     fOtherSet->complement();
2442     fOtherSet->removeAll(*fSepSet);
2443     fOtherSet->removeAll(*fFormatSet);
2444     fOtherSet->removeAll(*fSpSet);
2445     fOtherSet->removeAll(*fLowerSet);
2446     fOtherSet->removeAll(*fUpperSet);
2447     fOtherSet->removeAll(*fOLetterSet);
2448     fOtherSet->removeAll(*fNumericSet);
2449     fOtherSet->removeAll(*fATermSet);
2450     fOtherSet->removeAll(*fSContinueSet);
2451     fOtherSet->removeAll(*fSTermSet);
2452     fOtherSet->removeAll(*fCloseSet);
2453     fOtherSet->removeAll(*fExtendSet);
2454
2455     fSets->addElement(fSepSet,       status);
2456     fSets->addElement(fFormatSet,    status);
2457     fSets->addElement(fSpSet,        status);
2458     fSets->addElement(fLowerSet,     status);
2459     fSets->addElement(fUpperSet,     status);
2460     fSets->addElement(fOLetterSet,   status);
2461     fSets->addElement(fNumericSet,   status);
2462     fSets->addElement(fATermSet,     status);
2463     fSets->addElement(fSContinueSet, status);
2464     fSets->addElement(fSTermSet,     status);
2465     fSets->addElement(fCloseSet,     status);
2466     fSets->addElement(fOtherSet,     status);
2467     fSets->addElement(fExtendSet,    status);
2468
2469     if (U_FAILURE(status)) {
2470         deferredStatus = status;
2471     }
2472 }
2473
2474
2475
2476 void RBBISentMonkey::setText(const UnicodeString &s) {
2477     fText       = &s;
2478 }
2479
2480 UVector  *RBBISentMonkey::charClasses() {
2481     return fSets;
2482 }
2483
2484
2485 //  moveBack()   Find the "significant" code point preceding the index i.
2486 //               Skips over ($Extend | $Format)* .
2487 //
2488 int RBBISentMonkey::moveBack(int i) {
2489     if (i <= 0) {
2490         return -1;
2491     }
2492     UChar32   c;
2493     int32_t   j = i;
2494     do {
2495         j = fText->moveIndex32(j, -1);
2496         c = fText->char32At(j);
2497     }
2498     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
2499     return j;
2500
2501  }
2502
2503
2504 int RBBISentMonkey::moveForward(int i) {
2505     if (i>=fText->length()) {
2506         return fText->length();
2507     }
2508     UChar32   c;
2509     int32_t   j = i;
2510     do {
2511         j = fText->moveIndex32(j, 1);
2512         c = cAt(j);
2513     }
2514     while (fFormatSet->contains(c) || fExtendSet->contains(c));
2515     return j;
2516 }
2517
2518 UChar32 RBBISentMonkey::cAt(int pos) {
2519     if (pos<0 || pos>=fText->length()) {
2520         return -1;
2521     } else {
2522         return fText->char32At(pos);
2523     }
2524 }
2525
2526 int32_t RBBISentMonkey::next(int32_t prevPos) {
2527     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2528                               //   break position being tested.  The candidate break
2529                               //   location is before p2.
2530
2531     int     breakPos = -1;
2532
2533     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2534     UChar32 c;
2535
2536     if (U_FAILURE(deferredStatus)) {
2537         return -1;
2538     }
2539
2540     // Prev break at end of string.  return DONE.
2541     if (prevPos >= fText->length()) {
2542         return -1;
2543     }
2544     p0 = p1 = p2 = p3 = prevPos;
2545     c3 =  fText->char32At(prevPos);
2546     c0 = c1 = c2 = 0;
2547     (void)p0;     // Suppress set but not used warning.
2548
2549     // Loop runs once per "significant" character position in the input text.
2550     for (;;) {
2551         // Move all of the positions forward in the input string.
2552         p0 = p1;  c0 = c1;
2553         p1 = p2;  c1 = c2;
2554         p2 = p3;  c2 = c3;
2555
2556         // Advancd p3 by    X(Extend | Format)*   Rule 4
2557         p3 = moveForward(p3);
2558         c3 = cAt(p3);
2559
2560         // Rule (3)  CR x LF
2561         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2562             continue;
2563         }
2564
2565         // Rule (4).   Sep  <break>
2566         if (fSepSet->contains(c1)) {
2567             p2 = p1+1;   // Separators don't combine with Extend or Format.
2568             break;
2569         }
2570
2571         if (p2 >= fText->length()) {
2572             // Reached end of string.  Always a break position.
2573             break;
2574         }
2575
2576         if (p2 == prevPos) {
2577             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2578             continue;
2579         }
2580
2581         // Rule (6).   ATerm x Numeric
2582         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2583             continue;
2584         }
2585
2586         // Rule (7).  Upper ATerm  x  Uppper
2587         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2588             continue;
2589         }
2590
2591         // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep | STerm | ATerm))* Lower
2592         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2593         //                  note to the Unicode 5.0 documents.
2594         int p8 = p1;
2595         while (fSpSet->contains(cAt(p8))) {
2596             p8 = moveBack(p8);
2597         }
2598         while (fCloseSet->contains(cAt(p8))) {
2599             p8 = moveBack(p8);
2600         }
2601         if (fATermSet->contains(cAt(p8))) {
2602             p8=p2;
2603             for (;;) {
2604                 c = cAt(p8);
2605                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2606                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2607                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2608                     break;
2609                 }
2610                 p8 = moveForward(p8);
2611             }
2612             if (fLowerSet->contains(cAt(p8))) {
2613                 continue;
2614             }
2615         }
2616
2617         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2618         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2619             p8 = p1;
2620             while (fSpSet->contains(cAt(p8))) {
2621                 p8 = moveBack(p8);
2622             }
2623             while (fCloseSet->contains(cAt(p8))) {
2624                 p8 = moveBack(p8);
2625             }
2626             c = cAt(p8);
2627             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2628                 continue;
2629             }
2630         }
2631
2632         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2633         int p9 = p1;
2634         while (fCloseSet->contains(cAt(p9))) {
2635             p9 = moveBack(p9);
2636         }
2637         c = cAt(p9);
2638         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2639             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2640                 continue;
2641             }
2642         }
2643
2644         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2645         int p10 = p1;
2646         while (fSpSet->contains(cAt(p10))) {
2647             p10 = moveBack(p10);
2648         }
2649         while (fCloseSet->contains(cAt(p10))) {
2650             p10 = moveBack(p10);
2651         }
2652         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2653             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2654                 continue;
2655             }
2656         }
2657
2658         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2659         int p11 = p1;
2660         if (fSepSet->contains(cAt(p11))) {
2661             p11 = moveBack(p11);
2662         }
2663         while (fSpSet->contains(cAt(p11))) {
2664             p11 = moveBack(p11);
2665         }
2666         while (fCloseSet->contains(cAt(p11))) {
2667             p11 = moveBack(p11);
2668         }
2669         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2670             break;
2671         }
2672
2673         //  Rule (12)  Any x Any
2674         continue;
2675     }
2676     breakPos = p2;
2677     return breakPos;
2678 }
2679
2680 RBBISentMonkey::~RBBISentMonkey() {
2681     delete fSets;
2682     delete fSepSet;
2683     delete fFormatSet;
2684     delete fSpSet;
2685     delete fLowerSet;
2686     delete fUpperSet;
2687     delete fOLetterSet;
2688     delete fNumericSet;
2689     delete fATermSet;
2690     delete fSContinueSet;
2691     delete fSTermSet;
2692     delete fCloseSet;
2693     delete fOtherSet;
2694     delete fExtendSet;
2695 }
2696
2697
2698
2699 //-------------------------------------------------------------------------------------------
2700 //
2701 //  RBBILineMonkey
2702 //
2703 //-------------------------------------------------------------------------------------------
2704
2705 class RBBILineMonkey: public RBBIMonkeyKind {
2706 public:
2707     RBBILineMonkey();
2708     virtual          ~RBBILineMonkey();
2709     virtual  UVector *charClasses();
2710     virtual  void     setText(const UnicodeString &s);
2711     virtual  int32_t  next(int32_t i);
2712     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2713 private:
2714     UVector      *fSets;
2715
2716     UnicodeSet  *fBK;
2717     UnicodeSet  *fCR;
2718     UnicodeSet  *fLF;
2719     UnicodeSet  *fCM;
2720     UnicodeSet  *fNL;
2721     UnicodeSet  *fSG;
2722     UnicodeSet  *fWJ;
2723     UnicodeSet  *fZW;
2724     UnicodeSet  *fGL;
2725     UnicodeSet  *fCB;
2726     UnicodeSet  *fSP;
2727     UnicodeSet  *fB2;
2728     UnicodeSet  *fBA;
2729     UnicodeSet  *fBB;
2730     UnicodeSet  *fHY;
2731     UnicodeSet  *fH2;
2732     UnicodeSet  *fH3;
2733     UnicodeSet  *fCL;
2734     UnicodeSet  *fCP;
2735     UnicodeSet  *fEX;
2736     UnicodeSet  *fIN;
2737     UnicodeSet  *fJL;
2738     UnicodeSet  *fJV;
2739     UnicodeSet  *fJT;
2740     UnicodeSet  *fNS;
2741     UnicodeSet  *fOP;
2742     UnicodeSet  *fQU;
2743     UnicodeSet  *fIS;
2744     UnicodeSet  *fNU;
2745     UnicodeSet  *fPO;
2746     UnicodeSet  *fPR;
2747     UnicodeSet  *fSY;
2748     UnicodeSet  *fAI;
2749     UnicodeSet  *fAL;
2750     UnicodeSet  *fCJ;
2751     UnicodeSet  *fHL;
2752     UnicodeSet  *fID;
2753     UnicodeSet  *fRI;
2754     UnicodeSet  *fSA;
2755     UnicodeSet  *fXX;
2756
2757     BreakIterator        *fCharBI;
2758     const UnicodeString  *fText;
2759     RegexMatcher         *fNumberMatcher;
2760 };
2761
2762
2763 RBBILineMonkey::RBBILineMonkey()
2764 {
2765     UErrorCode  status = U_ZERO_ERROR;
2766
2767     fSets  = new UVector(status);
2768
2769     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2770     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2771     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2772     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2773     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2774     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2775     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2776     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2777     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2778     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2779     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2780     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2781     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2782     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2783     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2784     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2785     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2786     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2787     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2788     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2789     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2790     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2791     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2792     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2793     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2794     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2795     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2796     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2797     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2798     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2799     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2800     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2801     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2802     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2803     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2804     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2805     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2806     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2807     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2808     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2809
2810     if (U_FAILURE(status)) {
2811         deferredStatus = status;
2812         fCharBI = NULL;
2813         fNumberMatcher = NULL;
2814         return;
2815     }
2816
2817     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2818     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2819     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2820     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2821
2822     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2823
2824     fSets->addElement(fBK, status);
2825     fSets->addElement(fCR, status);
2826     fSets->addElement(fLF, status);
2827     fSets->addElement(fCM, status);
2828     fSets->addElement(fNL, status);
2829     fSets->addElement(fWJ, status);
2830     fSets->addElement(fZW, status);
2831     fSets->addElement(fGL, status);
2832     fSets->addElement(fCB, status);
2833     fSets->addElement(fSP, status);
2834     fSets->addElement(fB2, status);
2835     fSets->addElement(fBA, status);
2836     fSets->addElement(fBB, status);
2837     fSets->addElement(fHY, status);
2838     fSets->addElement(fH2, status);
2839     fSets->addElement(fH3, status);
2840     fSets->addElement(fCL, status);
2841     fSets->addElement(fCP, status);
2842     fSets->addElement(fEX, status);
2843     fSets->addElement(fIN, status);
2844     fSets->addElement(fJL, status);
2845     fSets->addElement(fJT, status);
2846     fSets->addElement(fJV, status);
2847     fSets->addElement(fNS, status);
2848     fSets->addElement(fOP, status);
2849     fSets->addElement(fQU, status);
2850     fSets->addElement(fIS, status);
2851     fSets->addElement(fNU, status);
2852     fSets->addElement(fPO, status);
2853     fSets->addElement(fPR, status);
2854     fSets->addElement(fSY, status);
2855     fSets->addElement(fAI, status);
2856     fSets->addElement(fAL, status);
2857     fSets->addElement(fHL, status);
2858     fSets->addElement(fID, status);
2859     fSets->addElement(fWJ, status);
2860     fSets->addElement(fRI, status);
2861     fSets->addElement(fSA, status);
2862     fSets->addElement(fSG, status);
2863
2864     const char *rules =
2865             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2866             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2867             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2868             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2869             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2870             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2871
2872     fNumberMatcher = new RegexMatcher(
2873         UnicodeString(rules, -1, US_INV), 0, status);
2874
2875     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2876
2877     if (U_FAILURE(status)) {
2878         deferredStatus = status;
2879     }
2880 }
2881
2882
2883 void RBBILineMonkey::setText(const UnicodeString &s) {
2884     fText       = &s;
2885     fCharBI->setText(s);
2886     fNumberMatcher->reset(s);
2887 }
2888
2889 //
2890 //  rule9Adjust
2891 //     Line Break TR rules 9 and 10 implementation.
2892 //     This deals with combining marks and other sequences that
2893 //     that must be treated as if they were something other than what they actually are.
2894 //
2895 //     This is factored out into a separate function because it must be applied twice for
2896 //     each potential break, once to the chars before the position being checked, then
2897 //     again to the text following the possible break.
2898 //
2899 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2900     if (pos == -1) {
2901         // Invalid initial position.  Happens during the warmup iteration of the
2902         //   main loop in next().
2903         return;
2904     }
2905
2906     int32_t  nPos = *nextPos;
2907
2908     // LB 9  Keep combining sequences together.
2909     //  advance over any CM class chars.  Note that Line Break CM is different
2910     //  from the normal Grapheme Extend property.
2911     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
2912           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
2913         for (;;) {
2914             *nextChar = fText->char32At(nPos);
2915             if (!fCM->contains(*nextChar)) {
2916                 break;
2917             }
2918             nPos = fText->moveIndex32(nPos, 1);
2919         }
2920     }
2921
2922
2923     // LB 9 Treat X CM* as if it were x.
2924     //       No explicit action required.
2925
2926     // LB 10  Treat any remaining combining mark as AL
2927     if (fCM->contains(*posChar)) {
2928         *posChar = 0x41;   // thisChar = 'A';
2929     }
2930
2931     // Push the updated nextPos and nextChar back to our caller.
2932     // This only makes a difference if posChar got bigger by consuming a
2933     // combining sequence.
2934     *nextPos  = nPos;
2935     *nextChar = fText->char32At(nPos);
2936 }
2937
2938
2939
2940 int32_t RBBILineMonkey::next(int32_t startPos) {
2941     UErrorCode status = U_ZERO_ERROR;
2942     int32_t    pos;       //  Index of the char following a potential break position
2943     UChar32    thisChar;  //  Character at above position "pos"
2944
2945     int32_t    prevPos;   //  Index of the char preceding a potential break position
2946     UChar32    prevChar;  //  Character at above position.  Note that prevChar
2947                           //   and thisChar may not be adjacent because combining
2948                           //   characters between them will be ignored.
2949
2950     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
2951     UChar32    prevCharX2;
2952
2953     int32_t    nextPos;   //  Index of the next character following pos.
2954                           //     Usually skips over combining marks.
2955     int32_t    nextCPPos; //  Index of the code point following "pos."
2956                           //     May point to a combining mark.
2957     int32_t    tPos;      //  temp value.
2958     UChar32    c;
2959
2960     if (U_FAILURE(deferredStatus)) {
2961         return -1;
2962     }
2963
2964     if (startPos >= fText->length()) {
2965         return -1;
2966     }
2967
2968
2969     // Initial values for loop.  Loop will run the first time without finding breaks,
2970     //                           while the invalid values shift out and the "this" and
2971     //                           "prev" positions are filled in with good values.
2972     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
2973     thisChar = prevChar  = prevCharX2 = 0;
2974     nextPos  = nextCPPos = startPos;
2975
2976
2977     // Loop runs once per position in the test text, until a break position
2978     //  is found.
2979     for (;;) {
2980         prevPosX2 = prevPos;
2981         prevCharX2 = prevChar;
2982
2983         prevPos   = pos;
2984         prevChar  = thisChar;
2985
2986         pos       = nextPos;
2987         thisChar  = fText->char32At(pos);
2988
2989         nextCPPos = fText->moveIndex32(pos, 1);
2990         nextPos   = nextCPPos;
2991
2992         // Rule LB2 - Break at end of text.
2993         if (pos >= fText->length()) {
2994             break;
2995         }
2996
2997         // Rule LB 9 - adjust for combining sequences.
2998         //             We do this one out-of-order because the adjustment does not change anything
2999         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3000         //             be applied.
3001         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3002         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3003         c = fText->char32At(nextPos);
3004         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3005
3006         // If the loop is still warming up - if we haven't shifted the initial
3007         //   -1 positions out of prevPos yet - loop back to advance the
3008         //    position in the input without any further looking for breaks.
3009         if (prevPos == -1) {
3010             continue;
3011         }
3012
3013         // LB 4  Always break after hard line breaks,
3014         if (fBK->contains(prevChar)) {
3015             break;
3016         }
3017
3018         // LB 5  Break after CR, LF, NL, but not inside CR LF
3019         if (prevChar == 0x0d && thisChar == 0x0a) {
3020             continue;
3021         }
3022         if (prevChar == 0x0d ||
3023             prevChar == 0x0a ||
3024             prevChar == 0x85)  {
3025             break;
3026         }
3027
3028         // LB 6  Don't break before hard line breaks
3029         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3030             fBK->contains(thisChar)) {
3031                 continue;
3032         }
3033
3034
3035         // LB 7  Don't break before spaces or zero-width space.
3036         if (fSP->contains(thisChar)) {
3037             continue;
3038         }
3039
3040         if (fZW->contains(thisChar)) {
3041             continue;
3042         }
3043
3044         // LB 8  Break after zero width space
3045         if (fZW->contains(prevChar)) {
3046             break;
3047         }
3048
3049         // LB 9, 10  Already done, at top of loop.
3050         //
3051
3052
3053         // LB 11  Do not break before or after WORD JOINER and related characters.
3054         //    x  WJ
3055         //    WJ  x
3056         //
3057         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3058             continue;
3059         }
3060
3061         // LB 12
3062         //    GL  x
3063         if (fGL->contains(prevChar)) {
3064             continue;
3065         }
3066
3067         // LB 12a
3068         //    [^SP BA HY] x GL
3069         if (!(fSP->contains(prevChar) ||
3070               fBA->contains(prevChar) ||
3071               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3072             continue;
3073         }
3074
3075
3076
3077         // LB 13  Don't break before closings.
3078         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3079         //        fall into LB 17 and the more general number regular expression.
3080         //
3081         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3082             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3083                                          fEX->contains(thisChar)  ||
3084             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3085             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3086             continue;
3087         }
3088
3089         // LB 14 Don't break after OP SP*
3090         //       Scan backwards, checking for this sequence.
3091         //       The OP char could include combining marks, so we actually check for
3092         //           OP CM* SP*
3093         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3094         //       sequence into a ID char, so before scanning back through spaces,
3095         //       verify that prevChar is indeed a space.  The prevChar variable
3096         //       may differ from fText[prevPos]
3097         tPos = prevPos;
3098         if (fSP->contains(prevChar)) {
3099             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3100                 tPos=fText->moveIndex32(tPos, -1);
3101             }
3102         }
3103         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3104             tPos=fText->moveIndex32(tPos, -1);
3105         }
3106         if (fOP->contains(fText->char32At(tPos))) {
3107             continue;
3108         }
3109
3110
3111         // LB 15    QU SP* x OP
3112         if (fOP->contains(thisChar)) {
3113             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3114             int tPos = prevPos;
3115             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3116                 tPos = fText->moveIndex32(tPos, -1);
3117             }
3118             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3119                 tPos = fText->moveIndex32(tPos, -1);
3120             }
3121             if (fQU->contains(fText->char32At(tPos))) {
3122                 continue;
3123             }
3124         }
3125
3126
3127
3128         // LB 16   (CL | CP) SP* x NS
3129         //    Scan backwards for SP* CM* (CL | CP)
3130         if (fNS->contains(thisChar)) {
3131             int tPos = prevPos;
3132             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3133                 tPos = fText->moveIndex32(tPos, -1);
3134             }
3135             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3136                 tPos = fText->moveIndex32(tPos, -1);
3137             }
3138             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3139                 continue;
3140             }
3141         }
3142
3143
3144         // LB 17        B2 SP* x B2
3145         if (fB2->contains(thisChar)) {
3146             //  Scan backwards, checking for the B2 CM* SP* sequence.
3147             tPos = prevPos;
3148             if (fSP->contains(prevChar)) {
3149                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3150                     tPos=fText->moveIndex32(tPos, -1);
3151                 }
3152             }
3153             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3154                 tPos=fText->moveIndex32(tPos, -1);
3155             }
3156             if (fB2->contains(fText->char32At(tPos))) {
3157                 continue;
3158             }
3159         }
3160
3161
3162         // LB 18    break after space
3163         if (fSP->contains(prevChar)) {
3164             break;
3165         }
3166
3167         // LB 19
3168         //    x   QU
3169         //    QU  x
3170         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3171             continue;
3172         }
3173
3174         // LB 20  Break around a CB
3175         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3176             break;
3177         }
3178
3179         // LB 21
3180         if (fBA->contains(thisChar) ||
3181             fHY->contains(thisChar) ||
3182             fNS->contains(thisChar) ||
3183             fBB->contains(prevChar) )   {
3184             continue;
3185         }
3186
3187         // LB 21a
3188         //   HL (HY | BA) x
3189         if (fHL->contains(prevCharX2) &&
3190                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3191             continue;
3192         }
3193
3194         // LB 21b
3195         //   SY x HL
3196         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3197             continue;
3198         }
3199
3200         // LB 22
3201         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3202             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3203             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3204             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3205             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3206             continue;
3207         }
3208
3209
3210         // LB 23    ID x PO
3211         //          AL x NU
3212         //          HL x NU
3213         //          NU x AL
3214         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3215             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3216             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3217             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3218             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3219             continue;
3220         }
3221
3222         // LB 24  Do not break between prefix and letters or ideographs.
3223         //        PR x ID
3224         //        PR x (AL | HL)
3225         //        PO x (AL | HL)
3226         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3227             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3228             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3229             continue;
3230         }
3231
3232
3233
3234         // LB 25    Numbers
3235         if (fNumberMatcher->lookingAt(prevPos, status)) {
3236             if (U_FAILURE(status)) {
3237                 break;
3238             }
3239             // Matched a number.  But could have been just a single digit, which would
3240             //    not represent a "no break here" between prevChar and thisChar
3241             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3242             if (numEndIdx > pos) {
3243                 // Number match includes at least our two chars being checked
3244                 if (numEndIdx > nextPos) {
3245                     // Number match includes additional chars.  Update pos and nextPos
3246                     //   so that next loop iteration will continue at the end of the number,
3247                     //   checking for breaks between last char in number & whatever follows.
3248                     pos = nextPos = numEndIdx;
3249                     do {
3250                         pos = fText->moveIndex32(pos, -1);
3251                         thisChar = fText->char32At(pos);
3252                     } while (fCM->contains(thisChar));
3253                 }
3254                 continue;
3255             }
3256         }
3257
3258
3259         // LB 26 Do not break a Korean syllable.
3260         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3261                                         fJV->contains(thisChar) ||
3262                                         fH2->contains(thisChar) ||
3263                                         fH3->contains(thisChar))) {
3264                                             continue;
3265                                         }
3266
3267         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3268             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3269                 continue;
3270         }
3271
3272         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3273             fJT->contains(thisChar)) {
3274                 continue;
3275         }
3276
3277         // LB 27 Treat a Korean Syllable Block the same as ID.
3278         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3279             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3280             fIN->contains(thisChar)) {
3281                 continue;
3282             }
3283         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3284             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3285             fPO->contains(thisChar)) {
3286                 continue;
3287             }
3288         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3289             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3290                 continue;
3291             }
3292
3293
3294
3295         // LB 28  Do not break between alphabetics ("at").
3296         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3297             continue;
3298         }
3299
3300         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3301         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3302             continue;
3303         }
3304
3305         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3306         //          (AL | NU) x OP
3307         //          CP x (AL | NU)
3308         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3309             continue;
3310         }
3311         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3312             continue;
3313         }
3314
3315         // LB30a  Do not break between regional indicators.
3316         //        RI x RI
3317         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3318             continue;
3319         }
3320
3321         // LB 31    Break everywhere else
3322         break;
3323
3324     }
3325
3326     return pos;
3327 }
3328
3329
3330 UVector  *RBBILineMonkey::charClasses() {
3331     return fSets;
3332 }
3333
3334
3335 RBBILineMonkey::~RBBILineMonkey() {
3336     delete fSets;
3337
3338     delete fBK;
3339     delete fCR;
3340     delete fLF;
3341     delete fCM;
3342     delete fNL;
3343     delete fWJ;
3344     delete fZW;
3345     delete fGL;
3346     delete fCB;
3347     delete fSP;
3348     delete fB2;
3349     delete fBA;
3350     delete fBB;
3351     delete fHY;
3352     delete fH2;
3353     delete fH3;
3354     delete fCL;
3355     delete fCP;
3356     delete fEX;
3357     delete fIN;
3358     delete fJL;
3359     delete fJV;
3360     delete fJT;
3361     delete fNS;
3362     delete fOP;
3363     delete fQU;
3364     delete fIS;
3365     delete fNU;
3366     delete fPO;
3367     delete fPR;
3368     delete fSY;
3369     delete fAI;
3370     delete fAL;
3371     delete fCJ;
3372     delete fHL;
3373     delete fID;
3374     delete fRI;
3375     delete fSA;
3376     delete fSG;
3377     delete fXX;
3378
3379     delete fCharBI;
3380     delete fNumberMatcher;
3381 }
3382
3383
3384 //-------------------------------------------------------------------------------------------
3385 //
3386 //   TestMonkey
3387 //
3388 //     params
3389 //       seed=nnnnn        Random number starting seed.
3390 //                         Setting the seed allows errors to be reproduced.
3391 //       loop=nnn          Looping count.  Controls running time.
3392 //                         -1:  run forever.
3393 //                          0 or greater:  run length.
3394 //
3395 //       type = char | word | line | sent | title
3396 //
3397 //-------------------------------------------------------------------------------------------
3398
3399 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3400     int32_t val = defaultVal;
3401     name.append(" *= *(-?\\d+)");
3402     UErrorCode status = U_ZERO_ERROR;
3403     RegexMatcher m(name, params, 0, status);
3404     if (m.find()) {
3405         // The param exists.  Convert the string to an int.
3406         char valString[100];
3407         int32_t paramLength = m.end(1, status) - m.start(1, status);
3408         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3409             paramLength = (int32_t)(sizeof(valString)-2);
3410         }
3411         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3412         val = strtol(valString,  NULL, 10);
3413
3414         // Delete this parameter from the params string.
3415         m.reset();
3416         params = m.replaceFirst("", status);
3417     }
3418     U_ASSERT(U_SUCCESS(status));
3419     return val;
3420 }
3421 #endif
3422
3423 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3424 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3425                                     BreakIterator *bi,
3426                                     int expected[],
3427                                     int expectedcount)
3428 {
3429     int count = 0;
3430     int i = 0;
3431     int forward[50];
3432     bi->setText(ustr);
3433     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3434         forward[count] = i;
3435         if (count < expectedcount && expected[count] != i) {
3436             test->errln("break forward test failed: expected %d but got %d",
3437                         expected[count], i);
3438             break;
3439         }
3440         count ++;
3441     }
3442     if (count != expectedcount) {
3443         printStringBreaks(ustr, expected, expectedcount);
3444         test->errln("break forward test failed: missed %d match",
3445                     expectedcount - count);
3446         return;
3447     }
3448     // testing boundaries
3449     for (i = 1; i < expectedcount; i ++) {
3450         int j = expected[i - 1];
3451         if (!bi->isBoundary(j)) {
3452             printStringBreaks(ustr, expected, expectedcount);
3453             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3454             return;
3455         }
3456         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3457             if (bi->isBoundary(j)) {
3458                 printStringBreaks(ustr, expected, expectedcount);
3459                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3460                 return;
3461             }
3462         }
3463     }
3464
3465     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3466         count --;
3467         if (forward[count] != i) {
3468             printStringBreaks(ustr, expected, expectedcount);
3469             test->errln("happy break test previous() failed: expected %d but got %d",
3470                         forward[count], i);
3471             break;
3472         }
3473     }
3474     if (count != 0) {
3475         printStringBreaks(ustr, expected, expectedcount);
3476         test->errln("break test previous() failed: missed a match");
3477         return;
3478     }
3479
3480     // testing preceding
3481     for (i = 0; i < expectedcount - 1; i ++) {
3482         // int j = expected[i] + 1;
3483         int j = ustr.moveIndex32(expected[i], 1);
3484         for (; j <= expected[i + 1]; j ++) {
3485             if (bi->preceding(j) != expected[i]) {
3486                 printStringBreaks(ustr, expected, expectedcount);
3487                 test->errln("preceding(): Not expecting boundary at position %d", j);
3488                 return;
3489             }
3490         }
3491     }
3492 }
3493 #endif
3494
3495 void RBBITest::TestWordBreaks(void)
3496 {
3497 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3498
3499     Locale        locale("en");
3500     UErrorCode    status = U_ZERO_ERROR;
3501     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3502     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3503     // Replaced any C+J characters in a row with a random sequence of characters
3504     // of the same length to make our C+J segmentation not get in the way.
3505     static const char *strlist[] =
3506     {
3507     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3508     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3509     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3510     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3511     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3512     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3513     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3514     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3515     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3516     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3517     "\\u2027\\U000e0067\\u0a47\\u00b7",
3518     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3519     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3520     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3521     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3522     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3523     "\\u0027\\u11af\\U000e0057\\u0602",
3524     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3525     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3526     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3527     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3528     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3529     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3530     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3531     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3532     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3533     "\\u18f4\\U000e0049\\u20e7\\u2027",
3534     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3535     "\\ua183\\u102d\\u0bec\\u003a",
3536     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3537     "\\u003a\\u0e57\\u0fad\\u002e",
3538     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3539     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3540     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3541     "\\u003a\\u0664\\u00b7\\u1fba",
3542     "\\u003b\\u0027\\u00b7\\u47a3",
3543     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3544     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3545     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3546     };
3547     int loop;
3548     if (U_FAILURE(status)) {
3549         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3550         return;
3551     }
3552     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3553         // printf("looping %d\n", loop);
3554         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3555         // RBBICharMonkey monkey;
3556         RBBIWordMonkey monkey;
3557
3558         int expected[50];
3559         int expectedcount = 0;
3560
3561         monkey.setText(ustr);
3562         int i;
3563         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3564             expected[expectedcount ++] = i;
3565         }
3566
3567         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3568     }
3569     delete bi;
3570 #endif
3571 }
3572
3573 void RBBITest::TestWordBoundary(void)
3574 {
3575     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3576     Locale        locale("en");
3577     UErrorCode    status = U_ZERO_ERROR;
3578     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3579     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3580     UChar         str[50];
3581     static const char *strlist[] =
3582     {
3583     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3584     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3585     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3586     "\\u2027\\U000e0067\\u0a47\\u00b7",
3587     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3588     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3589     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3590     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3591     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3592     "\\u0027\\u11af\\U000e0057\\u0602",
3593     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3594     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3595     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3596     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3597     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3598     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3599     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3600     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3601     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3602     "\\u58f4\\U000e0049\\u20e7\\u2027",
3603     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3604     "\\ua183\\u102d\\u0bec\\u003a",
3605     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3606     "\\u003a\\u0e57\\u0fad\\u002e",
3607     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3608     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3609     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3610     "\\u003a\\u0664\\u00b7\\u1fba",
3611     "\\u003b\\u0027\\u00b7\\u47a3",
3612     };
3613     int loop;
3614     if (U_FAILURE(status)) {
3615         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3616         return;
3617     }
3618     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3619         // printf("looping %d\n", loop);
3620         u_unescape(strlist[loop], str, 20);
3621         UnicodeString ustr(str);
3622         int forward[50];
3623         int count = 0;
3624
3625         bi->setText(ustr);
3626         int prev = 0;
3627         int i;
3628         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3629             forward[count ++] = i;
3630             if (i > prev) {
3631                 int j;
3632                 for (j = prev + 1; j < i; j ++) {
3633                     if (bi->isBoundary(j)) {
3634                         printStringBreaks(ustr, forward, count);
3635                         errln("happy boundary test failed: expected %d not a boundary",
3636                                j);
3637                         return;
3638                     }
3639                 }
3640             }
3641             if (!bi->isBoundary(i)) {
3642                 printStringBreaks(ustr, forward, count);
3643                 errln("happy boundary test failed: expected %d a boundary",
3644                        i);
3645                 return;
3646             }
3647             prev = i;
3648         }
3649     }
3650     delete bi;
3651 }
3652
3653 void RBBITest::TestLineBreaks(void)
3654 {
3655 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3656     Locale        locale("en");
3657     UErrorCode    status = U_ZERO_ERROR;
3658     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3659     const int32_t  STRSIZE = 50;
3660     UChar         str[STRSIZE];
3661     static const char *strlist[] =
3662     {
3663      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3664      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3665              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3666      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3667              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3668      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3669      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3670      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3671      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3672      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3673      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3674      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3675      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3676      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3677      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3678      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3679      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3680      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3681      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3682      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3683      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3684      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3685      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3686      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3687      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3688      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3689      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3690      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3691      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3692      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3693      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3694      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3695      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3696      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3697      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3698      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3699      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3700      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3701      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3702      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3703      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3704      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3705          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3706          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3707          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3708      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3709          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3710     };
3711     int loop;
3712     TEST_ASSERT_SUCCESS(status);
3713     if (U_FAILURE(status)) {
3714         return;
3715     }
3716     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3717         // printf("looping %d\n", loop);
3718         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3719         if (t >= STRSIZE) {
3720             TEST_ASSERT(FALSE);
3721             continue;
3722         }
3723
3724
3725         UnicodeString ustr(str);
3726         RBBILineMonkey monkey;
3727         if (U_FAILURE(monkey.deferredStatus)) {
3728             continue;
3729         }
3730
3731         const int EXPECTEDSIZE = 50;
3732         int expected[EXPECTEDSIZE];
3733         int expectedcount = 0;
3734
3735         monkey.setText(ustr);
3736         int i;
3737         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3738             if (expectedcount >= EXPECTEDSIZE) {
3739                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3740                 return;
3741             }
3742             expected[expectedcount ++] = i;
3743         }
3744
3745         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3746     }
3747     delete bi;
3748 #endif
3749 }
3750
3751 void RBBITest::TestSentBreaks(void)
3752 {
3753 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3754     Locale        locale("en");
3755     UErrorCode    status = U_ZERO_ERROR;
3756     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3757     UChar         str[200];
3758     static const char *strlist[] =
3759     {
3760      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3761      "This\n",
3762      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3763      "\"Sentence ending with a quote.\" Bye.",
3764      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3765      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3766      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3767      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3768      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3769      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3770      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3771              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3772              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3773              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3774      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3775              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3776              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3777              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3778              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3779              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3780     };
3781     int loop;
3782     if (U_FAILURE(status)) {
3783         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3784         return;
3785     }
3786     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3787         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3788         UnicodeString ustr(str);
3789
3790         RBBISentMonkey monkey;
3791         if (U_FAILURE(monkey.deferredStatus)) {
3792             continue;
3793         }
3794
3795         const int EXPECTEDSIZE = 50;
3796         int expected[EXPECTEDSIZE];
3797         int expectedcount = 0;
3798
3799         monkey.setText(ustr);
3800         int i;
3801         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3802             if (expectedcount >= EXPECTEDSIZE) {
3803                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3804                 return;
3805             }
3806             expected[expectedcount ++] = i;
3807         }
3808
3809         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3810     }
3811     delete bi;
3812 #endif
3813 }
3814
3815 void RBBITest::TestMonkey(char *params) {
3816 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3817
3818     UErrorCode     status    = U_ZERO_ERROR;
3819     int32_t        loopCount = 500;
3820     int32_t        seed      = 1;
3821     UnicodeString  breakType = "all";
3822     Locale         locale("en");
3823     UBool          useUText  = FALSE;
3824
3825     if (quick == FALSE) {
3826         loopCount = 10000;
3827     }
3828
3829     if (params) {
3830         UnicodeString p(params);
3831         loopCount = getIntParam("loop", p, loopCount);
3832         seed      = getIntParam("seed", p, seed);
3833
3834         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3835         if (m.find()) {
3836             breakType = m.group(1, status);
3837             m.reset();
3838             p = m.replaceFirst("", status);
3839         }
3840
3841         RegexMatcher u(" *utext", p, 0, status);
3842         if (u.find()) {
3843             useUText = TRUE;
3844             u.reset();
3845             p = u.replaceFirst("", status);
3846         }
3847
3848
3849         // m.reset(p);
3850         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3851             // Each option is stripped out of the option string as it is processed.
3852             // All options have been checked.  The option string should have been completely emptied..
3853             char buf[100];
3854             p.extract(buf, sizeof(buf), NULL, status);
3855             buf[sizeof(buf)-1] = 0;
3856             errln("Unrecognized or extra parameter:  %s\n", buf);
3857             return;
3858         }
3859
3860     }
3861
3862     if (breakType == "char" || breakType == "all") {
3863         RBBICharMonkey  m;
3864         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3865         if (U_SUCCESS(status)) {
3866             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3867             if (breakType == "all" && useUText==FALSE) {
3868                 // Also run a quick test with UText when "all" is specified
3869                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3870             }
3871         }
3872         else {
3873             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3874         }
3875         delete bi;
3876     }
3877
3878     if (breakType == "word" || breakType == "all") {
3879         logln("Word Break Monkey Test");
3880         RBBIWordMonkey  m;
3881         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3882         if (U_SUCCESS(status)) {
3883             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3884         }
3885         else {
3886             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3887         }
3888         delete bi;
3889     }
3890
3891     if (breakType == "line" || breakType == "all") {
3892         logln("Line Break Monkey Test");
3893         RBBILineMonkey  m;
3894         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3895         if (loopCount >= 10) {
3896             loopCount = loopCount / 5;   // Line break runs slower than the others.
3897         }
3898         if (U_SUCCESS(status)) {
3899             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3900         }
3901         else {
3902             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3903         }
3904         delete bi;
3905     }
3906
3907     if (breakType == "sent" || breakType == "all"  ) {
3908         logln("Sentence Break Monkey Test");
3909         RBBISentMonkey  m;
3910         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3911         if (loopCount >= 10) {
3912             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3913         }
3914         if (U_SUCCESS(status)) {
3915             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3916         }
3917         else {
3918             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3919         }
3920         delete bi;
3921     }
3922
3923 #endif
3924 }
3925
3926 //
3927 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
3928 //    Parameters:
3929 //       bi      - the break iterator to use
3930 //       mk      - MonkeyKind, abstraction for obtaining expected results
3931 //       name    - Name of test (char, word, etc.) for use in error messages
3932 //       seed    - Seed for starting random number generator (parameter from user)
3933 //       numIterations
3934 //
3935 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
3936                          int32_t numIterations, UBool useUText) {
3937
3938 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3939
3940     const int32_t    TESTSTRINGLEN = 500;
3941     UnicodeString    testText;
3942     int32_t          numCharClasses;
3943     UVector          *chClasses;
3944     int              expected[TESTSTRINGLEN*2 + 1];
3945     int              expectedCount = 0;
3946     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
3947     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
3948     char             reverseBreaks[TESTSTRINGLEN*2+1];
3949     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
3950     char             followingBreaks[TESTSTRINGLEN*2+1];
3951     char             precedingBreaks[TESTSTRINGLEN*2+1];
3952     int              i;
3953     int              loopCount = 0;
3954
3955     m_seed = seed;
3956
3957     numCharClasses = mk.charClasses()->size();
3958     chClasses      = mk.charClasses();
3959
3960     // Check for errors that occured during the construction of the MonkeyKind object.
3961     //  Can't report them where they occured because errln() is a method coming from intlTest,
3962     //  and is not visible outside of RBBITest :-(
3963     if (U_FAILURE(mk.deferredStatus)) {
3964         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
3965         return;
3966     }
3967
3968     // Verify that the character classes all have at least one member.
3969     for (i=0; i<numCharClasses; i++) {
3970         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
3971         if (s == NULL || s->size() == 0) {
3972             errln("Character Class #%d is null or of zero size.", i);
3973             return;
3974         }
3975     }
3976
3977     while (loopCount < numIterations || numIterations == -1) {
3978         if (numIterations == -1 && loopCount % 10 == 0) {
3979             // If test is running in an infinite loop, display a periodic tic so
3980             //   we can tell that it is making progress.
3981             fprintf(stderr, ".");
3982         }
3983         // Save current random number seed, so that we can recreate the random numbers
3984         //   for this loop iteration in event of an error.
3985         seed = m_seed;
3986
3987         // Populate a test string with data.
3988         testText.truncate(0);
3989         for (i=0; i<TESTSTRINGLEN; i++) {
3990             int32_t  aClassNum = m_rand() % numCharClasses;
3991             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
3992             int32_t   charIdx = m_rand() % classSet->size();
3993             UChar32   c = classSet->charAt(charIdx);
3994             if (c < 0) {   // TODO:  deal with sets containing strings.
3995                 errln("c < 0");
3996                 break;
3997             }
3998             testText.append(c);
3999         }
4000
4001         // Calculate the expected results for this test string.
4002         mk.setText(testText);
4003         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4004         expectedBreaks[0] = 1;
4005         int32_t breakPos = 0;
4006         expectedCount = 0;
4007         for (;;) {
4008             breakPos = mk.next(breakPos);
4009             if (breakPos == -1) {
4010                 break;
4011             }
4012             if (breakPos > testText.length()) {
4013                 errln("breakPos > testText.length()");
4014             }
4015             expectedBreaks[breakPos] = 1;
4016             U_ASSERT(expectedCount<testText.length());
4017             expected[expectedCount ++] = breakPos;
4018             (void)expected;   // Set but not used warning.
4019                               // TODO (andy): check it out.
4020         }
4021
4022         // Find the break positions using forward iteration
4023         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4024         if (useUText) {
4025             UErrorCode status = U_ZERO_ERROR;
4026             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4027             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4028             bi->setText(testUText, status);
4029             TEST_ASSERT_SUCCESS(status);
4030             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4031                                       //  This UText can be closed immediately, so long as the
4032                                       //  testText string continues to exist.
4033         } else {
4034             bi->setText(testText);
4035         }
4036
4037         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4038             if (i < 0 || i > testText.length()) {
4039                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4040                 break;
4041             }
4042             forwardBreaks[i] = 1;
4043         }
4044
4045         // Find the break positions using reverse iteration
4046         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4047         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4048             if (i < 0 || i > testText.length()) {
4049                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4050                 break;
4051             }
4052             reverseBreaks[i] = 1;
4053         }
4054
4055         // Find the break positions using isBoundary() tests.
4056         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4057         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4058         for (i=0; i<=testText.length(); i++) {
4059             isBoundaryBreaks[i] = bi->isBoundary(i);
4060         }
4061
4062
4063         // Find the break positions using the following() function.
4064         // printf(".");
4065         memset(followingBreaks, 0, sizeof(followingBreaks));
4066         int32_t   lastBreakPos = 0;
4067         followingBreaks[0] = 1;
4068         for (i=0; i<testText.length(); i++) {
4069             breakPos = bi->following(i);
4070             if (breakPos <= i ||
4071                 breakPos < lastBreakPos ||
4072                 breakPos > testText.length() ||
4073                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4074                 UChar32 brkChar = testText.char32At(lastBreakPos);
4075                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4076                 errln("%s break monkey test: "
4077                     "Out of range value returned by BreakIterator::following().\n"
4078                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4079                          name, seed, i, breakPos, lastBreakPos);
4080                 }
4081                 break;
4082             }
4083             followingBreaks[breakPos] = 1;
4084             lastBreakPos = breakPos;
4085         }
4086
4087         // Find the break positions using the preceding() function.
4088         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4089         lastBreakPos = testText.length();
4090         precedingBreaks[testText.length()] = 1;
4091         for (i=testText.length(); i>0; i--) {
4092             breakPos = bi->preceding(i);
4093             if (breakPos >= i ||
4094                 breakPos > lastBreakPos ||
4095                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4096                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4097                 UChar32 brkChar = testText.char32At(breakPos);
4098                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4099                 errln("%s break monkey test: "
4100                     "Out of range value returned by BreakIterator::preceding().\n"
4101                     "index=%d;  prev returned %d; lastBreak=%d" ,
4102                     name,  i, breakPos, lastBreakPos);
4103                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4104                     precedingBreaks[i] = 2;   // Forces an error.
4105                 }
4106                 }
4107             } else {
4108                 if (breakPos >= 0) {
4109                     precedingBreaks[breakPos] = 1;
4110                 }
4111                 lastBreakPos = breakPos;
4112             }
4113         }
4114
4115         // Compare the expected and actual results.
4116         for (i=0; i<=testText.length(); i++) {
4117             const char *errorType = NULL;
4118             if  (forwardBreaks[i] != expectedBreaks[i]) {
4119                 errorType = "next()";
4120             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4121                 errorType = "previous()";
4122             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4123                 errorType = "isBoundary()";
4124             } else if (followingBreaks[i] != expectedBreaks[i]) {
4125                 errorType = "following()";
4126             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4127                 errorType = "preceding()";
4128             }
4129
4130
4131             if (errorType != NULL) {
4132                 // Format a range of the test text that includes the failure as
4133                 //  a data item that can be included in the rbbi test data file.
4134
4135                 // Start of the range is the last point where expected and actual results
4136                 //   both agreed that there was a break position.
4137                 int startContext = i;
4138                 int32_t count = 0;
4139                 for (;;) {
4140                     if (startContext==0) { break; }
4141                     startContext --;
4142                     if (expectedBreaks[startContext] != 0) {
4143                         if (count == 2) break;
4144                         count ++;
4145                     }
4146                 }
4147
4148                 // End of range is two expected breaks past the start position.
4149                 int endContext = i + 1;
4150                 int ci;
4151                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4152                     for (;;) {
4153                         if (endContext >= testText.length()) {break;}
4154                         if (expectedBreaks[endContext-1] != 0) {
4155                             if (count == 0) break;
4156                             count --;
4157                         }
4158                         endContext ++;
4159                     }
4160                 }
4161
4162                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4163                 UnicodeString errorText = "<data>";
4164                 /***if (strcmp(errorType, "next()") == 0) {
4165                     startContext = 0;
4166                     endContext = testText.length();
4167
4168                     printStringBreaks(testText, expected, expectedCount);
4169                 }***/
4170
4171                 for (ci=startContext; ci<endContext;) {
4172                     UnicodeString hexChars("0123456789abcdef");
4173                     UChar32  c;
4174                     int      bn;
4175                     c = testText.char32At(ci);
4176                     if (ci == i) {
4177                         // This is the location of the error.
4178                         errorText.append("<?>");
4179                     } else if (expectedBreaks[ci] != 0) {
4180                         // This a non-error expected break position.
4181                         errorText.append("\\");
4182                     }
4183                     if (c < 0x10000) {
4184                         errorText.append("\\u");
4185                         for (bn=12; bn>=0; bn-=4) {
4186                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4187                         }
4188                     } else {
4189                         errorText.append("\\U");
4190                         for (bn=28; bn>=0; bn-=4) {
4191                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4192                         }
4193                     }
4194                     ci = testText.moveIndex32(ci, 1);
4195                 }
4196                 errorText.append("\\");
4197                 errorText.append("</data>\n");
4198
4199                 // Output the error
4200                 char  charErrorTxt[500];
4201                 UErrorCode status = U_ZERO_ERROR;
4202                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4203                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4204                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4205
4206                 UChar32 brkChar = testText.char32At(i);
4207                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4208                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4209                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4210                     errorType, seed, i, charErrorTxt);
4211                 }
4212                 break;
4213             }
4214         }
4215
4216         loopCount++;
4217     }
4218 #endif
4219 }
4220
4221
4222 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4223 //             This test checks the initial patch,
4224 //             which is to just keep it from crashing.  Correct word boundaries
4225 //             await a proper fix to the dictionary code.
4226 //
4227 void RBBITest::TestBug5532(void)  {
4228    // Text includes a mixture of Thai and Latin.
4229    const unsigned char utf8Data[] = {
4230            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4231            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4232            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4233            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4234            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4235            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4236            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4237            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4238            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4239            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4240            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4241
4242     UErrorCode status = U_ZERO_ERROR;
4243     UText utext=UTEXT_INITIALIZER;
4244     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4245     TEST_ASSERT_SUCCESS(status);
4246
4247     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4248     TEST_ASSERT_SUCCESS(status);
4249     if (U_SUCCESS(status)) {
4250         bi->setText(&utext, status);
4251         TEST_ASSERT_SUCCESS(status);
4252
4253         int32_t breakCount = 0;
4254         int32_t previousBreak = -1;
4255         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4256             // For now, just make sure that the break iterator doesn't hang.
4257             TEST_ASSERT(previousBreak < bi->current());
4258             previousBreak = bi->current();
4259         }
4260         TEST_ASSERT(breakCount > 0);
4261     }
4262     delete bi;
4263     utext_close(&utext);
4264 }
4265
4266
4267 void RBBITest::TestBug9983(void)  {
4268     UnicodeString text = UnicodeString("\\u002A"  // * Other
4269                                        "\\uFF65"  //   Other
4270                                        "\\u309C"  //   Katakana
4271                                        "\\uFF9F"  //   Extend
4272                                        "\\uFF65"  //   Other
4273                                        "\\u0020"  //   Other
4274                                        "\\u0000").unescape();
4275
4276     UErrorCode status = U_ZERO_ERROR;
4277     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4278         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4279     TEST_ASSERT_SUCCESS(status);
4280     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4281         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4282     TEST_ASSERT_SUCCESS(status);
4283     if (U_FAILURE(status)) {
4284         return;
4285     }
4286     int32_t offset, rstatus, iterationCount;
4287
4288     brkiter->setText(text);
4289     brkiter->last();
4290     iterationCount = 0;
4291     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4292         iterationCount++;
4293         rstatus = brkiter->getRuleStatus();
4294         (void)rstatus;     // Suppress set but not used warning.
4295         if (iterationCount >= 10) {
4296            break;
4297         }
4298     }
4299     TEST_ASSERT(iterationCount == 6);
4300
4301     brkiterPOSIX->setText(text);
4302     brkiterPOSIX->last();
4303     iterationCount = 0;
4304     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4305         iterationCount++;
4306         rstatus = brkiterPOSIX->getRuleStatus();
4307         (void)rstatus;     // Suppress set but not used warning.
4308         if (iterationCount >= 10) {
4309            break;
4310         }
4311     }
4312     TEST_ASSERT(iterationCount == 6);
4313 }
4314
4315
4316 //
4317 //  TestDebug    -  A place-holder test for debugging purposes.
4318 //                  For putting in fragments of other tests that can be invoked
4319 //                  for tracing  without a lot of unwanted extra stuff happening.
4320 //
4321 void RBBITest::TestDebug(void) {
4322 #if 0
4323     UErrorCode   status = U_ZERO_ERROR;
4324     int pos = 0;
4325     int ruleStatus = 0;
4326
4327     RuleBasedBreakIterator* bi =
4328        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4329        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4330        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4331     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4332     // UnicodeString s("Aaa.  Bcd");
4333     s = s.unescape();
4334     bi->setText(s);
4335     UBool r = bi->isBoundary(8);
4336     printf("%s", r?"true":"false");
4337     return;
4338     pos = bi->last();
4339     do {
4340         // ruleStatus = bi->getRuleStatus();
4341         printf("%d\t%d\n", pos, ruleStatus);
4342         pos = bi->previous();
4343     } while (pos != BreakIterator::DONE);
4344 #endif
4345 }
4346
4347 void RBBITest::TestProperties() {
4348     UErrorCode errorCode = U_ZERO_ERROR;
4349     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4350     if (!prependSet.isEmpty()) {
4351         errln(
4352             "[:GCB=Prepend:] is not empty any more. "
4353             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4354             "change this test to the opposite condition.");
4355     }
4356 }
4357
4358 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */