icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2012, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include <typeinfo>  // for 'typeid' to work
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_BREAK_ITERATION
  17
  18 #include "unicode/utypes.h"
  19 #include "unicode/brkiter.h"
  20 #include "unicode/rbbi.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/utf16.h"
  23 #include "unicode/ucnv.h"
  24 #include "unicode/schriter.h"
  25 #include "unicode/uniset.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27 #include "unicode/regex.h"
  28 #endif
  29 #include "unicode/ustring.h"
  30 #include "unicode/utext.h"
  31 #include "intltest.h"
  32 #include "rbbitst.h"
  33 #include <string.h>
  34 #include "uvector.h"
  35 #include "uvectr32.h"
  36 #include "triedict.h"
  37 #include <string.h>
  38 #include <stdio.h>
  39 #include <stdlib.h>
  40
  41 #define TEST_ASSERT(x) {if (!(x)) { \
  42     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  43
  44 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  45     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  46
  47
  48 //---------------------------------------------
  49 // runIndexedTest
  50 //---------------------------------------------
  51
  52
//  Note:  Before adding new tests to this file, check whether the desired test data can
//         simply be added to the file testdata/rbbitest.txt.  In most cases it can;
//         it's much less work than writing a new test, diagnostic output in the event of failures
//         is good, and the test data file is shared with ICU4J, so eventually the test
//         will run there as well, without additional effort.
  58
  59 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  60 {
  61     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  62
  63     switch (index) {
  64 #if !UCONFIG_NO_FILE_IO
  65         case 0: name = "TestBug4153072";
  66             if(exec) TestBug4153072();                         break;
  67 #else
  68         case 0: name = "skip";
  69             break;
  70 #endif
  71
  72         case 1: name = "skip";
  73             break;
  74         case 2: name = "TestStatusReturn";
  75             if(exec) TestStatusReturn();                       break;
  76
  77 #if !UCONFIG_NO_FILE_IO
  78         case 3: name = "TestUnicodeFiles";
  79             if(exec) TestUnicodeFiles();                       break;
  80         case 4: name = "TestEmptyString";
  81             if(exec) TestEmptyString();                        break;
  82 #else
  83         case 3: case 4: name = "skip";
  84             break;
  85 #endif
  86
  87         case 5: name = "TestGetAvailableLocales";
  88             if(exec) TestGetAvailableLocales();                break;
  89
  90         case 6: name = "TestGetDisplayName";
  91             if(exec) TestGetDisplayName();                     break;
  92
  93 #if !UCONFIG_NO_FILE_IO
  94         case 7: name = "TestEndBehaviour";
  95             if(exec) TestEndBehaviour();                       break;
  96         case 8: case 9: case 10: name = "skip";
  97              break;
  98         case 11: name = "TestWordBreaks";
  99              if(exec) TestWordBreaks();                        break;
 100         case 12: name = "TestWordBoundary";
 101              if(exec) TestWordBoundary();                      break;
 102         case 13: name = "TestLineBreaks";
 103              if(exec) TestLineBreaks();                        break;
 104         case 14: name = "TestSentBreaks";
 105              if(exec) TestSentBreaks();                        break;
 106         case 15: name = "TestExtended";
 107              if(exec) TestExtended();                          break;
 108 #else
 109         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 110              break;
 111 #endif
 112
 113 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 114         case 16:  name = "TestMonkey";
 115             if(exec)  TestMonkey(params);                      break;
 116 #else
 117         case 16:
 118              name = "skip";                                    break;
 119 #endif
 120
 121 #if !UCONFIG_NO_FILE_IO
 122         case 17: name = "TestBug3818";
 123             if(exec) TestBug3818();                            break;
 124 #else
 125         case 17: name = "skip";
 126             break;
 127 #endif
 128
 129         case 18: name = "skip";
 130             break;
 131         case 19: name = "TestDebug";
 132             if(exec) TestDebug();                              break;
 133         case 20: name = "TestTrieDict";
 134             if(exec) TestTrieDict();                           break;
 135
 136 #if !UCONFIG_NO_FILE_IO
 137         case 21: name = "TestBug5775";
 138             if (exec) TestBug5775();                           break;
 139 #else
 140         case 21: name = "skip";
 141             break;
 142 #endif
 143
 144         case 22: name = "skip";
 145             break;
 146         case 23: name = "TestDictRules";
 147             if (exec) TestDictRules();                         break;
 148         case 24: name = "TestBug5532";
 149             if (exec) TestBug5532();                           break;
 150         default: name = ""; break; //needed to end loop
 151     }
 152 }
 153
 154
 155 //---------------------------------------------------------------------------
 156 //
 157 //   class BITestData   Holds a set of Break iterator test data and results
 158 //                      Includes
 159 //                         - the string data to be broken
 160 //                         - a vector of the expected break positions.
 161 //                         - a vector of source line numbers for the data,
//                               (to help see where errors occurred.)
 163 //                         - The expected break tag values.
 164 //                         - Vectors of actual break positions and tag values.
 165 //                         - Functions for comparing actual with expected and
 166 //                            reporting errors.
 167 //
 168 //----------------------------------------------------------------------------
 169 class BITestData {
 170 public:
 171     UnicodeString    fDataToBreak;
 172     UVector          fExpectedBreakPositions;
 173     UVector          fExpectedTags;
 174     UVector          fLineNum;
 175     UVector          fActualBreakPositions;   // Test Results.
 176     UVector          fActualTags;
 177
 178     BITestData(UErrorCode &status);
 179     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 180     void             checkResults(const char *heading, RBBITest *test);
 181     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 182     void             clearResults();
 183 };
 184
 185 //
 186 // Constructor.
 187 //
 188 BITestData::BITestData(UErrorCode &status)
 189 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 190   fActualTags(status)
 191 {
 192 }
 193
 194 //
 195 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 196 //                 The macro form collects the line number, which is helpful
 197 //                 when tracking down failures.
 198 //
 199 //                 A null data item is inserted at the start of each test's data
 200 //                  to put the starting zero into the data list.  The position saved for
 201 //                  each non-null item is its ending position.
 202 //
 203 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 204 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 205     if (U_FAILURE(status)) {return;}
 206     if (data != NULL) {
 207         fDataToBreak.append(CharsToUnicodeString(data));
 208     }
 209     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 210     fExpectedTags.addElement(tag, status);
 211     fLineNum.addElement(lineNum, status);
 212 }
 213
 214
 215 //
 216 //  checkResults.   Compare the actual and expected break positions, report any differences.
 217 //
 218 void BITestData::checkResults(const char *heading, RBBITest *test) {
 219     int32_t   expectedIndex = 0;
 220     int32_t   actualIndex = 0;
 221
 222     for (;;) {
 223         // If we've run through both the expected and actual results vectors, we're done.
 224         //   break out of the loop.
 225         if (expectedIndex >= fExpectedBreakPositions.size() &&
 226             actualIndex   >= fActualBreakPositions.size()) {
 227             break;
 228         }
 229
 230
 231         if (expectedIndex >= fExpectedBreakPositions.size()) {
 232             err(heading, test, expectedIndex-1, actualIndex);
 233             actualIndex++;
 234             continue;
 235         }
 236
 237         if (actualIndex >= fActualBreakPositions.size()) {
 238             err(heading, test, expectedIndex, actualIndex-1);
 239             expectedIndex++;
 240             continue;
 241         }
 242
 243         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 244             err(heading, test, expectedIndex, actualIndex);
 245             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 246             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 247                 actualIndex++;
 248             } else {
 249                 expectedIndex++;
 250             }
 251             continue;
 252         }
 253
 254         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 255             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 256                 heading, fLineNum.elementAt(expectedIndex),
 257                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 258         }
 259
 260         actualIndex++;
 261         expectedIndex++;
 262     }
 263 }
 264
 265 //
 266 //  err   -  An error was found.  Report it, along with information about where the
 267 //                                incorrectly broken test data appeared in the source file.
 268 //
 269 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 270 {
 271     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 272     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 273     int32_t   o        = 0;
 274     int32_t   line     = fLineNum.elementAti(expectedIdx);
 275     if (expectedIdx > 0) {
 276         // The line numbers are off by one because a premature break occurs somewhere
 277         //    within the previous item, rather than at the start of the current (expected) item.
 278         //    We want to report the offset of the unexpected break from the start of
 279         //      this previous item.
 280         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 281     }
 282     if (actual < expected) {
 283         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 284     } else {
 285         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 286     }
 287 }
 288
 289
 290 void BITestData::clearResults() {
 291     fActualBreakPositions.removeAllElements();
 292     fActualTags.removeAllElements();
 293 }
 294
 295
 296 //--------------------------------------------------------------------------------------
 297 //
 298 //    RBBITest    constructor and destructor
 299 //
 300 //--------------------------------------------------------------------------------------
 301
 302 RBBITest::RBBITest() {
 303 }
 304
 305
 306 RBBITest::~RBBITest() {
 307 }
 308
 309 //-----------------------------------------------------------------------------------
 310 //
 311 //   Test for status {tag} return value from break rules.
 312 //        TODO:  a more thorough test.
 313 //
 314 //-----------------------------------------------------------------------------------
 315 void RBBITest::TestStatusReturn() {
 316      UnicodeString rulesString1("$Letters = [:L:];\n"
 317                                   "$Numbers = [:N:];\n"
 318                                   "$Letters+{1};\n"
 319                                   "$Numbers+{2};\n"
 320                                   "Help\\ {4}/me\\!;\n"
 321                                   "[^$Letters $Numbers];\n"
 322                                   "!.*;\n", -1, US_INV);
 323      UnicodeString testString1  = "abc123..abc Help me Help me!";
 324                                 // 01234567890123456789012345678
 325      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 326      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 327
 328      UErrorCode status=U_ZERO_ERROR;
 329      UParseError    parseError;
 330
 331      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 332      if(U_FAILURE(status)) {
 333          dataerrln("FAIL : in construction - %s", u_errorName(status));
 334      } else {
 335          int32_t  pos;
 336          int32_t  i = 0;
 337          bi->setText(testString1);
 338          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 339              if (pos != bounds1[i]) {
 340                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 341                  break;
 342              }
 343
 344              int tag = bi->getRuleStatus();
 345              if (tag != brkStatus[i]) {
 346                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 347                  break;
 348              }
 349              i++;
 350          }
 351      }
 352      delete bi;
 353 }
 354
 355
 356 static void printStringBreaks(UnicodeString ustr, int expected[],
 357                               int expectedcount)
 358 {
 359     UErrorCode status = U_ZERO_ERROR;
 360     char name[100];
 361     printf("code    alpha extend alphanum type word sent line name\n");
 362     int j;
 363     for (j = 0; j < ustr.length(); j ++) {
 364         if (expectedcount > 0) {
 365             int k;
 366             for (k = 0; k < expectedcount; k ++) {
 367                 if (j == expected[k]) {
 368                     printf("------------------------------------------------ %d\n",
 369                            j);
 370                 }
 371             }
 372         }
 373         UChar32 c = ustr.char32At(j);
 374         if (c > 0xffff) {
 375             j ++;
 376         }
 377         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 378         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 379                            u_isUAlphabetic(c),
 380                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 381                            u_isalnum(c),
 382                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 383                                                   u_charType(c),
 384                                                   U_SHORT_PROPERTY_NAME),
 385                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 386                                                   u_getIntPropertyValue(c,
 387                                                           UCHAR_WORD_BREAK),
 388                                                   U_SHORT_PROPERTY_NAME),
 389                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 390                                    u_getIntPropertyValue(c,
 391                                            UCHAR_SENTENCE_BREAK),
 392                                    U_SHORT_PROPERTY_NAME),
 393                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 394                                    u_getIntPropertyValue(c,
 395                                            UCHAR_LINE_BREAK),
 396                                    U_SHORT_PROPERTY_NAME),
 397                            name);
 398     }
 399 }
 400
 401
 402 void RBBITest::TestBug3818() {
 403     UErrorCode  status = U_ZERO_ERROR;
 404
 405     // Four Thai words...
 406     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 407                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 408     UnicodeString  thaiStr(thaiWordData);
 409
 410     RuleBasedBreakIterator* bi =
 411         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 412     if (U_FAILURE(status) || bi == NULL) {
 413         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 414         return;
 415     }
 416     bi->setText(thaiStr);
 417
 418     int32_t  startOfSecondWord = bi->following(1);
 419     if (startOfSecondWord != 4) {
 420         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 421             __FILE__, __LINE__, startOfSecondWord);
 422     }
 423     startOfSecondWord = bi->following(0);
 424     if (startOfSecondWord != 4) {
 425         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 426             __FILE__, __LINE__, startOfSecondWord);
 427     }
 428     delete bi;
 429 }
 430
 431
 432 void RBBITest::TestTrieDict() {
 433     UErrorCode      status  = U_ZERO_ERROR;
 434
 435     //
 436     //  Open and read the test data file.
 437     //
 438     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 439     char testFileName[1000];
 440     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
 441         errln("Can't open test data.  Path too long.");
 442         return;
 443     }
 444     strcpy(testFileName, testDataDirectory);
 445     strcat(testFileName, "riwords.txt");
 446
 447     // Items needing deleting at the end
 448     MutableTrieDictionary *mutableDict = NULL;
 449     CompactTrieDictionary *compactDict = NULL;
 450     UnicodeSet            *breaks      = NULL;
 451     UChar                 *testFile    = NULL;
 452     StringEnumeration     *enumer1     = NULL;
 453     StringEnumeration     *enumer2     = NULL;
 454     MutableTrieDictionary *mutable2    = NULL;
 455     StringEnumeration     *cloneEnum   = NULL;
 456     CompactTrieDictionary *compact2    = NULL;
 457
 458
 459     const UnicodeString *originalWord = NULL;
 460     const UnicodeString *cloneWord    = NULL;
 461     UChar *current;
 462     UChar *word;
 463     UChar uc;
 464     int32_t wordLen;
 465     int32_t wordCount;
 466     int32_t testCount;
 467
 468     int    len;
 469     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
 470     if (U_FAILURE(status)) {
 471         goto cleanup; /* something went wrong, error already output */
 472     }
 473
 474     mutableDict = new MutableTrieDictionary(0x0E1C, status);
 475     if (U_FAILURE(status)) {
 476         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
 477         goto cleanup;
 478     }
 479
 480     breaks = new UnicodeSet;
 481     breaks->add(0x000A);     // Line Feed
 482     breaks->add(0x000D);     // Carriage Return
 483     breaks->add(0x2028);     // Line Separator
 484     breaks->add(0x2029);     // Paragraph Separator
 485
 486     // Now add each non-comment line of the file as a word.
 487     current = testFile;
 488     word = current;
 489     uc = *current++;
 490     wordLen = 0;
 491     wordCount = 0;
 492
 493     while (uc) {
 494         if (uc == 0x0023) {     // #comment line, skip
 495             while (uc && !breaks->contains(uc)) {
 496                 uc = *current++;
 497             }
 498         }
 499         else while (uc && !breaks->contains(uc)) {
 500             ++wordLen;
 501             uc = *current++;
 502         }
 503         if (wordLen > 0) {
 504             mutableDict->addWord(word, wordLen, status);
 505             if (U_FAILURE(status)) {
 506                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
 507                 goto cleanup;
 508             }
 509             wordCount += 1;
 510         }
 511
 512         // Find beginning of next line
 513         while (uc && breaks->contains(uc)) {
 514             uc = *current++;
 515         }
 516         word = current-1;
 517         wordLen = 0;
 518     }
 519
 520     if (wordCount < 50) {
 521         errln("Word count (%d) unreasonably small\n", wordCount);
 522         goto cleanup;
 523     }
 524
 525     enumer1 = mutableDict->openWords(status);
 526     if (U_FAILURE(status)) {
 527         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
 528         goto cleanup;
 529     }
 530
 531     testCount = 0;
 532     if (wordCount != (testCount = enumer1->count(status))) {
 533         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 534             testCount, wordCount, u_errorName(status));
 535         goto cleanup;
 536     }
 537
 538     // Now compact it
 539     compactDict = new CompactTrieDictionary(*mutableDict, status);
 540     if (U_FAILURE(status)) {
 541         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
 542         goto cleanup;
 543     }
 544
 545     enumer2 = compactDict->openWords(status);
 546     if (U_FAILURE(status)) {
 547         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
 548         goto cleanup;
 549     }
 550
 551     if (wordCount != (testCount = enumer2->count(status))) {
 552         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 553             testCount, wordCount, u_errorName(status));
 554         goto cleanup;
 555     }
 556
 557     if (typeid(*enumer1) == typeid(*enumer2)) {
 558         errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
 559     }
 560     delete enumer1;
 561     enumer1 = NULL;
 562     delete enumer2;
 563     enumer2 = NULL;
 564
 565     // Now un-compact it
 566     mutable2 = compactDict->cloneMutable(status);
 567     if (U_FAILURE(status)) {
 568         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
 569         goto cleanup;
 570     }
 571
 572     cloneEnum = mutable2->openWords(status);
 573     if (U_FAILURE(status)) {
 574         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
 575         goto cleanup;
 576     }
 577
 578     if (wordCount != (testCount = cloneEnum->count(status))) {
 579         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 580             testCount, wordCount, u_errorName(status));
 581         goto cleanup;
 582     }
 583
 584     // Compact original dictionary to clone. Note that we can only compare the same kind of
 585     // dictionary as the order of the enumerators is not guaranteed to be the same between
 586     // different kinds
 587     enumer1 = mutableDict->openWords(status);
 588     if (U_FAILURE(status)) {
 589         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
 590         goto cleanup;
 591      }
 592
 593     originalWord = enumer1->snext(status);
 594     cloneWord = cloneEnum->snext(status);
 595     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
 596         if (*originalWord != *cloneWord) {
 597             errln("Original and cloned MutableTrieDictionary word mismatch\n");
 598             goto cleanup;
 599         }
 600         originalWord = enumer1->snext(status);
 601         cloneWord = cloneEnum->snext(status);
 602     }
 603
 604     if (U_FAILURE(status)) {
 605         errln("Enumeration failed: %s\n", u_errorName(status));
 606         goto cleanup;
 607     }
 608
 609     if (originalWord != cloneWord) {
 610         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
 611         goto cleanup;
 612     }
 613
 614     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
 615     compact2 = new CompactTrieDictionary(compactDict->data(), status);
 616     if (U_FAILURE(status)) {
 617         errln("CompactTrieDictionary(const void *,...) failed\n");
 618         goto cleanup;
 619     }
 620
 621     if (compact2->dataSize() == 0) {
 622         errln("CompactTrieDictionary->dataSize() == 0\n");
 623         goto cleanup;
 624     }
 625
 626     // Now count the words via the second dictionary
 627     delete enumer1;
 628     enumer1 = compact2->openWords(status);
 629     if (U_FAILURE(status)) {
 630         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
 631         goto cleanup;
 632     }
 633
 634     if (wordCount != (testCount = enumer1->count(status))) {
 635         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
 636             testCount, wordCount, u_errorName(status));
 637         goto cleanup;
 638     }
 639
 640 cleanup:
 641     delete compactDict;
 642     delete mutableDict;
 643     delete breaks;
 644     delete[] testFile;
 645     delete enumer1;
 646     delete mutable2;
 647     delete cloneEnum;
 648     delete compact2;
 649 }
 650
 651
 652 //----------------------------------------------------------------------------
 653 //
 654 // generalIteratorTest      Given a break iterator and a set of test data,
 655 //                          Run the tests and report the results.
 656 //
 657 //----------------------------------------------------------------------------
 658 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 659 {
 660
 661     bi.setText(td.fDataToBreak);
 662
 663     testFirstAndNext(bi, td);
 664
 665     testLastAndPrevious(bi, td);
 666
 667     testFollowing(bi, td);
 668     testPreceding(bi, td);
 669     testIsBoundary(bi, td);
 670     doMultipleSelectionTest(bi, td);
 671 }
 672
 673
 674 //
 675 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 676 //                       kind of loop.
 677 //
 678 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 679 {
 680     UErrorCode  status = U_ZERO_ERROR;
 681     int32_t     p;
 682     int32_t     lastP = -1;
 683     int32_t     tag;
 684
 685     logln("Test first and next");
 686     bi.setText(td.fDataToBreak);
 687     td.clearResults();
 688
 689     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 690         td.fActualBreakPositions.addElement(p, status);  // Save result.
 691         tag = bi.getRuleStatus();
 692         td.fActualTags.addElement(tag, status);
 693         if (p <= lastP) {
 694             // If the iterator is not making forward progress, stop.
 695             //  No need to raise an error here, it'll be detected in the normal check of results.
 696             break;
 697         }
 698         lastP = p;
 699     }
 700     td.checkResults("testFirstAndNext", this);
 701 }
 702
 703
 704 //
 705 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 706 //
 707 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 708 {
 709     UErrorCode  status = U_ZERO_ERROR;
 710     int32_t     p;
 711     int32_t     lastP  = 0x7ffffffe;
 712     int32_t     tag;
 713
 714     logln("Test last and previous");
 715     bi.setText(td.fDataToBreak);
 716     td.clearResults();
 717
 718     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 719         // Save break position.  Insert it at start of vector of results, shoving
 720         //    already-saved results further towards the end.
 721         td.fActualBreakPositions.insertElementAt(p, 0, status);
 722         // bi.previous();   // TODO:  Why does this fix things up????
 723         // bi.next();
 724         tag = bi.getRuleStatus();
 725         td.fActualTags.insertElementAt(tag, 0, status);
 726         if (p >= lastP) {
 727             // If the iterator is not making progress, stop.
 728             //  No need to raise an error here, it'll be detected in the normal check of results.
 729             break;
 730         }
 731         lastP = p;
 732     }
 733     td.checkResults("testLastAndPrevious", this);
 734 }
 735
 736
 737 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 738 {
 739     UErrorCode  status = U_ZERO_ERROR;
 740     int32_t     p;
 741     int32_t     tag;
 742     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 743                                  //   cannot be -1; that is returned for DONE.
 744     int         i;
 745
 746     logln("testFollowing():");
 747     bi.setText(td.fDataToBreak);
 748     td.clearResults();
 749
 750     // Save the starting point, since we won't get that out of following.
 751     p = bi.first();
 752     td.fActualBreakPositions.addElement(p, status);  // Save result.
 753     tag = bi.getRuleStatus();
 754     td.fActualTags.addElement(tag, status);
 755
 756     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 757         p = bi.following(i);
 758         if (p != lastP) {
 759             if (p == RuleBasedBreakIterator::DONE) {
 760                 break;
 761             }
 762             // We've reached a new break position.  Save it.
 763             td.fActualBreakPositions.addElement(p, status);  // Save result.
 764             tag = bi.getRuleStatus();
 765             td.fActualTags.addElement(tag, status);
 766             lastP = p;
 767         }
 768     }
 769     // The loop normally exits by means of the break in the middle.
 770     // Make sure that the index was at the correct position for the break iterator to have
 771     //   returned DONE.
 772     if (i != td.fDataToBreak.length()) {
 773         errln("testFollowing():  iterator returned DONE prematurely.");
 774     }
 775
 776     // Full check of all results.
 777     td.checkResults("testFollowing", this);
 778 }
 779
 780
 781
 782 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 783     UErrorCode  status = U_ZERO_ERROR;
 784     int32_t     p;
 785     int32_t     tag;
 786     int32_t     lastP  = 0x7ffffffe;
 787     int         i;
 788
 789     logln("testPreceding():");
 790     bi.setText(td.fDataToBreak);
 791     td.clearResults();
 792
 793     p = bi.last();
 794     td.fActualBreakPositions.addElement(p, status);
 795     tag = bi.getRuleStatus();
 796     td.fActualTags.addElement(tag, status);
 797
 798     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 799         p = bi.preceding(i);
 800         if (p != lastP) {
 801             if (p == RuleBasedBreakIterator::DONE) {
 802                 break;
 803             }
 804             // We've reached a new break position.  Save it.
 805             td.fActualBreakPositions.insertElementAt(p, 0, status);
 806             lastP = p;
 807             tag = bi.getRuleStatus();
 808             td.fActualTags.insertElementAt(tag, 0, status);
 809         }
 810     }
 811     // The loop normally exits by means of the break in the middle.
 812     // Make sure that the index was at the correct position for the break iterator to have
 813     //   returned DONE.
 814     if (i != 0) {
 815         errln("testPreceding():  iterator returned DONE prematurely.");
 816     }
 817
 818     // Full check of all results.
 819     td.checkResults("testPreceding", this);
 820 }
 821
 822
 823
 824 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 825     UErrorCode  status = U_ZERO_ERROR;
 826     int         i;
 827     int32_t     tag;
 828
 829     logln("testIsBoundary():");
 830     bi.setText(td.fDataToBreak);
 831     td.clearResults();
 832
 833     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 834         if (bi.isBoundary(i)) {
 835             td.fActualBreakPositions.addElement(i, status);  // Save result.
 836             tag = bi.getRuleStatus();
 837             td.fActualTags.addElement(tag, status);
 838         }
 839     }
 840     td.checkResults("testIsBoundary: ", this);
 841 }
 842
 843
 844
 845 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 846 {
 847     iterator.setText(td.fDataToBreak);
 848
 849     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 850     int32_t offset = iterator.first();
 851     int32_t testOffset;
 852     int32_t count = 0;
 853
 854     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 855
 856     if (*testIterator != iterator)
 857         errln("clone() or operator!= failed: two clones compared unequal");
 858
 859     do {
 860         testOffset = testIterator->first();
 861         testOffset = testIterator->next(count);
 862         if (offset != testOffset)
 863             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 864
 865         if (offset != RuleBasedBreakIterator::DONE) {
 866             count++;
 867             offset = iterator.next();
 868
 869             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 870                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 871                 if (count > 10000 || offset == -1) {
 872                     errln("operator== failed too many times. Stopping test.");
 873                     if (offset == -1) {
 874                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 875                     }
 876                     return;
 877                 }
 878             }
 879         }
 880     } while (offset != RuleBasedBreakIterator::DONE);
 881
 882     // now do it backwards...
 883     offset = iterator.last();
 884     count = 0;
 885
 886     do {
 887         testOffset = testIterator->last();
 888         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 889         if (offset != testOffset)
 890             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 891
 892         if (offset != RuleBasedBreakIterator::DONE) {
 893             count--;
 894             offset = iterator.previous();
 895         }
 896     } while (offset != RuleBasedBreakIterator::DONE);
 897
 898     delete testIterator;
 899 }
 900
 901
 902 //---------------------------------------------
 903 //
 904 //     other tests
 905 //
 906 //---------------------------------------------
 907 void RBBITest::TestEmptyString()
 908 {
 909     UnicodeString text = "";
 910     UErrorCode status = U_ZERO_ERROR;
 911
 912     BITestData x(status);
 913     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 914     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 915     if (U_FAILURE(status))
 916     {
 917         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 918         return;
 919     }
 920     generalIteratorTest(*bi, x);
 921     delete bi;
 922 }
 923
 924 void RBBITest::TestGetAvailableLocales()
 925 {
 926     int32_t locCount = 0;
 927     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 928
 929     if (locCount == 0)
 930         dataerrln("getAvailableLocales() returned an empty list!");
 931     // Just make sure that it's returning good memory.
 932     int32_t i;
 933     for (i = 0; i < locCount; ++i) {
 934         logln(locList[i].getName());
 935     }
 936 }
 937
 938 //Testing the BreakIterator::getDisplayName() function
 939 void RBBITest::TestGetDisplayName()
 940 {
 941     UnicodeString   result;
 942
 943     BreakIterator::getDisplayName(Locale::getUS(), result);
 944     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 945         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 946                 + result);
 947
 948     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 949     if (result != "French (France)")
 950         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 951                 + result);
 952 }
 953 /**
 954  * Test End Behaviour
 955  * @bug 4068137
 956  */
 957 void RBBITest::TestEndBehaviour()
 958 {
 959     UErrorCode status = U_ZERO_ERROR;
 960     UnicodeString testString("boo.");
 961     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 962     if (U_FAILURE(status))
 963     {
 964         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 965         return;
 966     }
 967     wb->setText(testString);
 968
 969     if (wb->first() != 0)
 970         errln("Didn't get break at beginning of string.");
 971     if (wb->next() != 3)
 972         errln("Didn't get break before period in \"boo.\"");
 973     if (wb->current() != 4 && wb->next() != 4)
 974         errln("Didn't get break at end of string.");
 975     delete wb;
 976 }
 977 /*
 978  * @bug 4153072
 979  */
 980 void RBBITest::TestBug4153072() {
 981     UErrorCode status = U_ZERO_ERROR;
 982     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 983     if (U_FAILURE(status))
 984     {
 985         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 986         return;
 987     }
 988     UnicodeString str("...Hello, World!...");
 989     int32_t begin = 3;
 990     int32_t end = str.length() - 3;
 991     UBool onBoundary;
 992
 993     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 994     iter->adoptText(textIterator);
 995     int index;
 996     // Note: with the switch to UText, there is no way to restrict the
 997     //       iteration range to begin at an index other than zero.
 998     //       String character iterators created with a non-zero bound are
 999     //         treated by RBBI as being empty.
1000     for (index = -1; index < begin + 1; ++index) {
1001         onBoundary = iter->isBoundary(index);
1002         if (index == 0?  !onBoundary : onBoundary) {
1003             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1004                             " and begin index = " + begin);
1005         }
1006     }
1007     delete iter;
1008 }
1009
1010
1011 //
1012 // Test for problem reported by Ashok Matoria on 9 July 2007
1013 //    One.<kSoftHyphen><kSpace>Two.
1014 //
1015 //    Sentence break at start (0) and then on calling next() it breaks at
1016 //   'T' of "Two". Now, at this point if I do next() and
1017 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1018 //
1019 void RBBITest::TestBug5775() {
1020     UErrorCode status = U_ZERO_ERROR;
1021     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1022     TEST_ASSERT_SUCCESS(status);
1023     if (U_FAILURE(status)) {
1024         return;
1025     }
1026 // Check for status first for better handling of no data errors.
1027     TEST_ASSERT(bi != NULL);
1028     if (bi == NULL) {
1029         return;
1030     }
1031
1032     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1033     //               01234      56789
1034     s = s.unescape();
1035     bi->setText(s);
1036     int pos = bi->next();
1037     TEST_ASSERT(pos == 6);
1038     pos = bi->next();
1039     TEST_ASSERT(pos == 10);
1040     pos = bi->previous();
1041     TEST_ASSERT(pos == 6);
1042     delete bi;
1043 }
1044
1045
1046
1047 //------------------------------------------------------------------------------
1048 //
1049 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
1050 //
1051 //------------------------------------------------------------------------------
1052
1053 struct TestParams {
1054     BreakIterator   *bi;
1055     UnicodeString    dataToBreak;
1056     UVector32       *expectedBreaks;
1057     UVector32       *srcLine;
1058     UVector32       *srcCol;
1059 };
1060
1061 void RBBITest::executeTest(TestParams *t) {
1062     int32_t    bp;
1063     int32_t    prevBP;
1064     int32_t    i;
1065
1066     if (t->bi == NULL) {
1067         return;
1068     }
1069
1070     t->bi->setText(t->dataToBreak);
1071     //
1072     //  Run the iterator forward
1073     //
1074     prevBP = -1;
1075     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1076         if (prevBP ==  bp) {
1077             // Fail for lack of forward progress.
1078             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1079                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1080             break;
1081         }
1082
1083         // Check that there were we didn't miss an expected break between the last one
1084         //  and this one.
1085         for (i=prevBP+1; i<bp; i++) {
1086             if (t->expectedBreaks->elementAti(i) != 0) {
1087                 int expected[] = {0, i};
1088                 printStringBreaks(t->dataToBreak, expected, 2);
1089                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1090                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1091             }
1092         }
1093
1094         // Check that the break we did find was expected
1095         if (t->expectedBreaks->elementAti(bp) == 0) {
1096             int expected[] = {0, bp};
1097             printStringBreaks(t->dataToBreak, expected, 2);
1098             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1099                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1100         } else {
1101             // The break was expected.
1102             //   Check that the {nnn} tag value is correct.
1103             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1104             if (expectedTagVal == -1) {
1105                 expectedTagVal = 0;
1106             }
1107             int32_t line = t->srcLine->elementAti(bp);
1108             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1109             if (rs != expectedTagVal) {
1110                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1111                       "          Actual, Expected status = %4d, %4d",
1112                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1113             }
1114         }
1115
1116
1117         prevBP = bp;
1118     }
1119
1120     // Verify that there were no missed expected breaks after the last one found
1121     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1122         if (t->expectedBreaks->elementAti(i) != 0) {
1123             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1124                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1125         }
1126     }
1127
1128     //
1129     //  Run the iterator backwards, verify that the same breaks are found.
1130     //
1131     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1132     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1133         if (prevBP ==  bp) {
1134             // Fail for lack of progress.
1135             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1136                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1137             break;
1138         }
1139
1140         // Check that there were we didn't miss an expected break between the last one
1141         //  and this one.  (UVector returns zeros for index out of bounds.)
1142         for (i=prevBP-1; i>bp; i--) {
1143             if (t->expectedBreaks->elementAti(i) != 0) {
1144                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1145                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1146             }
1147         }
1148
1149         // Check that the break we did find was expected
1150         if (t->expectedBreaks->elementAti(bp) == 0) {
1151             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1152                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1153         } else {
1154             // The break was expected.
1155             //   Check that the {nnn} tag value is correct.
1156             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1157             if (expectedTagVal == -1) {
1158                 expectedTagVal = 0;
1159             }
1160             int line = t->srcLine->elementAti(bp);
1161             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1162             if (rs != expectedTagVal) {
1163                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1164                       "          Actual, Expected status = %4d, %4d",
1165                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1166             }
1167         }
1168
1169         prevBP = bp;
1170     }
1171
1172     // Verify that there were no missed breaks prior to the last one found
1173     for (i=prevBP-1; i>=0; i--) {
1174         if (t->expectedBreaks->elementAti(i) != 0) {
1175             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1176                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1177         }
1178     }
1179 }
1180
1181
1182 void RBBITest::TestExtended() {
1183 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1184     UErrorCode      status  = U_ZERO_ERROR;
1185     Locale          locale("");
1186
1187     UnicodeString       rules;
1188     TestParams          tp;
1189     tp.bi             = NULL;
1190     tp.expectedBreaks = new UVector32(status);
1191     tp.srcLine        = new UVector32(status);
1192     tp.srcCol         = new UVector32(status);
1193
1194     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1195     if (U_FAILURE(status)) {
1196         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1197     }
1198
1199
1200     //
1201     //  Open and read the test data file.
1202     //
1203     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1204     char testFileName[1000];
1205     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1206         errln("Can't open test data.  Path too long.");
1207         return;
1208     }
1209     strcpy(testFileName, testDataDirectory);
1210     strcat(testFileName, "rbbitst.txt");
1211
1212     int    len;
1213     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1214     if (U_FAILURE(status)) {
1215         return; /* something went wrong, error already output */
1216     }
1217
1218
1219
1220
1221     //
1222     //  Put the test data into a UnicodeString
1223     //
1224     UnicodeString testString(FALSE, testFile, len);
1225
1226     enum EParseState{
1227         PARSE_COMMENT,
1228         PARSE_TAG,
1229         PARSE_DATA,
1230         PARSE_NUM
1231     }
1232     parseState = PARSE_TAG;
1233
1234     EParseState savedState = PARSE_TAG;
1235
1236     static const UChar CH_LF        = 0x0a;
1237     static const UChar CH_CR        = 0x0d;
1238     static const UChar CH_HASH      = 0x23;
1239     /*static const UChar CH_PERIOD    = 0x2e;*/
1240     static const UChar CH_LT        = 0x3c;
1241     static const UChar CH_GT        = 0x3e;
1242     static const UChar CH_BACKSLASH = 0x5c;
1243     static const UChar CH_BULLET    = 0x2022;
1244
1245     int32_t    lineNum  = 1;
1246     int32_t    colStart = 0;
1247     int32_t    column   = 0;
1248     int32_t    charIdx  = 0;
1249
1250     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1251
1252     for (charIdx = 0; charIdx < len; ) {
1253         status = U_ZERO_ERROR;
1254         UChar  c = testString.charAt(charIdx);
1255         charIdx++;
1256         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1257             // treat CRLF as a unit
1258             c = CH_LF;
1259             charIdx++;
1260         }
1261         if (c == CH_LF || c == CH_CR) {
1262             lineNum++;
1263             colStart = charIdx;
1264         }
1265         column = charIdx - colStart + 1;
1266
1267         switch (parseState) {
1268         case PARSE_COMMENT:
1269             if (c == 0x0a || c == 0x0d) {
1270                 parseState = savedState;
1271             }
1272             break;
1273
1274         case PARSE_TAG:
1275             {
1276             if (c == CH_HASH) {
1277                 parseState = PARSE_COMMENT;
1278                 savedState = PARSE_TAG;
1279                 break;
1280             }
1281             if (u_isUWhiteSpace(c)) {
1282                 break;
1283             }
1284             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1285                 delete tp.bi;
1286                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1287                 charIdx += 5;
1288                 break;
1289             }
1290             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1291                 delete tp.bi;
1292                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1293                 charIdx += 5;
1294                 break;
1295             }
1296             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1297                 delete tp.bi;
1298                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1299                 charIdx += 5;
1300                 break;
1301             }
1302             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1303                 delete tp.bi;
1304                 tp.bi = NULL;
1305                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1306                 charIdx += 5;
1307                 break;
1308             }
1309             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1310                 delete tp.bi;
1311                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1312                 charIdx += 6;
1313                 break;
1314             }
1315
1316             // <locale  loc_name>
1317             localeMatcher.reset(testString);
1318             if (localeMatcher.lookingAt(charIdx-1, status)) {
1319                 UnicodeString localeName = localeMatcher.group(1, status);
1320                 char localeName8[100];
1321                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1322                 locale = Locale::createFromName(localeName8);
1323                 charIdx += localeMatcher.group(0, status).length();
1324                 TEST_ASSERT_SUCCESS(status);
1325                 break;
1326             }
1327             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1328                 parseState = PARSE_DATA;
1329                 charIdx += 5;
1330                 tp.dataToBreak = "";
1331                 tp.expectedBreaks->removeAllElements();
1332                 tp.srcCol ->removeAllElements();
1333                 tp.srcLine->removeAllElements();
1334                 break;
1335             }
1336
1337             errln("line %d: Tag expected in test file.", lineNum);
1338             parseState = PARSE_COMMENT;
1339             savedState = PARSE_DATA;
1340             goto end_test; // Stop the test.
1341             }
1342             break;
1343
1344         case PARSE_DATA:
1345             if (c == CH_BULLET) {
1346                 int32_t  breakIdx = tp.dataToBreak.length();
1347                 tp.expectedBreaks->setSize(breakIdx+1);
1348                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1349                 tp.srcLine->setSize(breakIdx+1);
1350                 tp.srcLine->setElementAt(lineNum, breakIdx);
1351                 tp.srcCol ->setSize(breakIdx+1);
1352                 tp.srcCol ->setElementAt(column, breakIdx);
1353                 break;
1354             }
1355
1356             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1357                 // Add final entry to mappings from break location to source file position.
1358                 //  Need one extra because last break position returned is after the
1359                 //    last char in the data, not at the last char.
1360                 tp.srcLine->addElement(lineNum, status);
1361                 tp.srcCol ->addElement(column, status);
1362
1363                 parseState = PARSE_TAG;
1364                 charIdx += 6;
1365
1366                 // RUN THE TEST!
1367                 executeTest(&tp);
1368                 break;
1369             }
1370
1371             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1372                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1373                 // Get the code point from the name and insert it into the test data.
1374                 //   (Damn, no API takes names in Unicode  !!!
1375                 //    we've got to take it back to char *)
1376                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1377                 int32_t nameLength = nameEndIdx - (charIdx+2);
1378                 char charNameBuf[200];
1379                 UChar32 theChar = -1;
1380                 if (nameEndIdx != -1) {
1381                     UErrorCode status = U_ZERO_ERROR;
1382                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1383                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1384                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1385                     if (U_FAILURE(status)) {
1386                         theChar = -1;
1387                     }
1388                 }
1389                 if (theChar == -1) {
1390                     errln("Error in named character in test file at line %d, col %d",
1391                         lineNum, column);
1392                 } else {
1393                     // Named code point was recognized.  Insert it
1394                     //   into the test data.
1395                     tp.dataToBreak.append(theChar);
1396                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1397                         tp.srcLine->addElement(lineNum, status);
1398                         tp.srcCol ->addElement(column, status);
1399                     }
1400                 }
1401                 if (nameEndIdx > charIdx) {
1402                     charIdx = nameEndIdx+1;
1403
1404                 }
1405                 break;
1406             }
1407
1408
1409
1410
1411             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1412                 charIdx++;
1413                 int32_t  breakIdx = tp.dataToBreak.length();
1414                 tp.expectedBreaks->setSize(breakIdx+1);
1415                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1416                 tp.srcLine->setSize(breakIdx+1);
1417                 tp.srcLine->setElementAt(lineNum, breakIdx);
1418                 tp.srcCol ->setSize(breakIdx+1);
1419                 tp.srcCol ->setElementAt(column, breakIdx);
1420                 break;
1421             }
1422
1423             if (c == CH_LT) {
1424                 tagValue   = 0;
1425                 parseState = PARSE_NUM;
1426                 break;
1427             }
1428
1429             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1430                 parseState = PARSE_COMMENT;
1431                 savedState = PARSE_DATA;
1432                 break;
1433             }
1434
1435             if (c == CH_BACKSLASH) {
1436                 // Check for \ at end of line, a line continuation.
1437                 //     Advance over (discard) the newline
1438                 UChar32 cp = testString.char32At(charIdx);
1439                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1440                     // We have a CR LF
1441                     //  Need an extra increment of the input ptr to move over both of them
1442                     charIdx++;
1443                 }
1444                 if (cp == CH_LF || cp == CH_CR) {
1445                     lineNum++;
1446                     colStart = charIdx;
1447                     charIdx++;
1448                     break;
1449                 }
1450
1451                 // Let unescape handle the back slash.
1452                 cp = testString.unescapeAt(charIdx);
1453                 if (cp != -1) {
1454                     // Escape sequence was recognized.  Insert the char
1455                     //   into the test data.
1456                     tp.dataToBreak.append(cp);
1457                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1458                         tp.srcLine->addElement(lineNum, status);
1459                         tp.srcCol ->addElement(column, status);
1460                     }
1461                     break;
1462                 }
1463
1464
1465                 // Not a recognized backslash escape sequence.
1466                 // Take the next char as a literal.
1467                 //  TODO:  Should this be an error?
1468                 c = testString.charAt(charIdx);
1469                 charIdx = testString.moveIndex32(charIdx, 1);
1470             }
1471
1472             // Normal, non-escaped data char.
1473             tp.dataToBreak.append(c);
1474
1475             // Save the mapping from offset in the data to line/column numbers in
1476             //   the original input file.  Will be used for better error messages only.
1477             //   If there's an expected break before this char, the slot in the mapping
1478             //     vector will already be set for this char; don't overwrite it.
1479             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1480                 tp.srcLine->addElement(lineNum, status);
1481                 tp.srcCol ->addElement(column, status);
1482             }
1483             break;
1484
1485
1486         case PARSE_NUM:
1487             // We are parsing an expected numeric tag value, like <1234>,
1488             //   within a chunk of data.
1489             if (u_isUWhiteSpace(c)) {
1490                 break;
1491             }
1492
1493             if (c == CH_GT) {
1494                 // Finished the number.  Add the info to the expected break data,
1495                 //   and switch parse state back to doing plain data.
1496                 parseState = PARSE_DATA;
1497                 if (tagValue == 0) {
1498                     tagValue = -1;
1499                 }
1500                 int32_t  breakIdx = tp.dataToBreak.length();
1501                 tp.expectedBreaks->setSize(breakIdx+1);
1502                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1503                 tp.srcLine->setSize(breakIdx+1);
1504                 tp.srcLine->setElementAt(lineNum, breakIdx);
1505                 tp.srcCol ->setSize(breakIdx+1);
1506                 tp.srcCol ->setElementAt(column, breakIdx);
1507                 break;
1508             }
1509
1510             if (u_isdigit(c)) {
1511                 tagValue = tagValue*10 + u_charDigitValue(c);
1512                 break;
1513             }
1514
1515             errln("Syntax Error in test file at line %d, col %d",
1516                 lineNum, column);
1517             parseState = PARSE_COMMENT;
1518             goto end_test; // Stop the test
1519             break;
1520         }
1521
1522
1523         if (U_FAILURE(status)) {
1524             dataerrln("ICU Error %s while parsing test file at line %d.",
1525                 u_errorName(status), lineNum);
1526             status = U_ZERO_ERROR;
1527             goto end_test; // Stop the test
1528         }
1529
1530     }
1531
1532 end_test:
1533     delete tp.bi;
1534     delete tp.expectedBreaks;
1535     delete tp.srcLine;
1536     delete tp.srcCol;
1537     delete [] testFile;
1538 #endif
1539 }
1540
1541
1542 //-------------------------------------------------------------------------------
1543 //
1544 //  TestDictRules   create a break iterator from source rules that includes a
1545 //                  dictionary range.   Regression for bug #7130.  Source rules
1546 //                  do not declare a break iterator type (word, line, sentence, etc.
1547 //                  but the dictionary code, without a type, would loop.
1548 //
1549 //-------------------------------------------------------------------------------
1550 void RBBITest::TestDictRules() {
1551     const char *rules =  "$dictionary = [a-z]; \n"
1552                          "!!forward; \n"
1553                          "$dictionary $dictionary; \n"
1554                          "!!reverse; \n"
1555                          "$dictionary $dictionary; \n";
1556     const char *text = "aa";
1557     UErrorCode status = U_ZERO_ERROR;
1558     UParseError parseError;
1559
1560     RuleBasedBreakIterator bi(rules, parseError, status);
1561     if (U_SUCCESS(status)) {
1562         UnicodeString utext = text;
1563         bi.setText(utext);
1564         int32_t position;
1565         int32_t loops;
1566         for (loops = 0; loops<10; loops++) {
1567             position = bi.next();
1568             if (position == RuleBasedBreakIterator::DONE) {
1569                 break;
1570             }
1571         }
1572         TEST_ASSERT(loops == 1);
1573     } else {
1574         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1575     }
1576 }
1577
1578
1579
1580 //-------------------------------------------------------------------------------
1581 //
1582 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1583 //    return the datain one big UChar * buffer, which the caller must delete.
1584 //
1585 //    parameters:
1586 //          fileName:   the name of the file, with no directory part.  The test data directory
1587 //                      is assumed.
1588 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1589 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1590 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1591 //                      Pass NULL for the system default encoding.
1592 //          status
1593 //    returns:
1594 //                      The file data, converted to UChar.
1595 //                      The caller must delete this when done with
1596 //                           delete [] theBuffer;
1597 //
1598 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1599 //           Move this function to some common place.
1600 //
1601 //--------------------------------------------------------------------------------
1602 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1603     UChar       *retPtr  = NULL;
1604     char        *fileBuf = NULL;
1605     UConverter* conv     = NULL;
1606     FILE        *f       = NULL;
1607
1608     ulen = 0;
1609     if (U_FAILURE(status)) {
1610         return retPtr;
1611     }
1612
1613     //
1614     //  Open the file.
1615     //
1616     f = fopen(fileName, "rb");
1617     if (f == 0) {
1618         dataerrln("Error opening test data file %s\n", fileName);
1619         status = U_FILE_ACCESS_ERROR;
1620         return NULL;
1621     }
1622     //
1623     //  Read it in
1624     //
1625     int   fileSize;
1626     int   amt_read;
1627
1628     fseek( f, 0, SEEK_END);
1629     fileSize = ftell(f);
1630     fileBuf = new char[fileSize];
1631     fseek(f, 0, SEEK_SET);
1632     amt_read = fread(fileBuf, 1, fileSize, f);
1633     if (amt_read != fileSize || fileSize <= 0) {
1634         errln("Error reading test data file.");
1635         goto cleanUpAndReturn;
1636     }
1637
1638     //
1639     // Look for a Unicode Signature (BOM) on the data just read
1640     //
1641     int32_t        signatureLength;
1642     const char *   fileBufC;
1643     const char*    bomEncoding;
1644
1645     fileBufC = fileBuf;
1646     bomEncoding = ucnv_detectUnicodeSignature(
1647         fileBuf, fileSize, &signatureLength, &status);
1648     if(bomEncoding!=NULL ){
1649         fileBufC  += signatureLength;
1650         fileSize  -= signatureLength;
1651         encoding = bomEncoding;
1652     }
1653
1654     //
1655     // Open a converter to take the rule file to UTF-16
1656     //
1657     conv = ucnv_open(encoding, &status);
1658     if (U_FAILURE(status)) {
1659         goto cleanUpAndReturn;
1660     }
1661
1662     //
1663     // Convert the rules to UChar.
1664     //  Preflight first to determine required buffer size.
1665     //
1666     ulen = ucnv_toUChars(conv,
1667         NULL,           //  dest,
1668         0,              //  destCapacity,
1669         fileBufC,
1670         fileSize,
1671         &status);
1672     if (status == U_BUFFER_OVERFLOW_ERROR) {
1673         // Buffer Overflow is expected from the preflight operation.
1674         status = U_ZERO_ERROR;
1675
1676         retPtr = new UChar[ulen+1];
1677         ucnv_toUChars(conv,
1678             retPtr,       //  dest,
1679             ulen+1,
1680             fileBufC,
1681             fileSize,
1682             &status);
1683     }
1684
1685 cleanUpAndReturn:
1686     fclose(f);
1687     delete []fileBuf;
1688     ucnv_close(conv);
1689     if (U_FAILURE(status)) {
1690         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1691         delete []retPtr;
1692         retPtr = 0;
1693         ulen   = 0;
1694     };
1695     return retPtr;
1696 }
1697
1698
1699
1700 //--------------------------------------------------------------------------------------------
1701 //
1702 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1703 //
1704 //-------------------------------------------------------------------------------------------
1705 void RBBITest::TestUnicodeFiles() {
1706     RuleBasedBreakIterator  *bi;
1707     UErrorCode               status = U_ZERO_ERROR;
1708
1709     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1710     TEST_ASSERT_SUCCESS(status);
1711     if (U_SUCCESS(status)) {
1712         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1713     }
1714     delete bi;
1715
1716     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1717     TEST_ASSERT_SUCCESS(status);
1718     if (U_SUCCESS(status)) {
1719         runUnicodeTestData("WordBreakTest.txt", bi);
1720     }
1721     delete bi;
1722
1723     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1724     TEST_ASSERT_SUCCESS(status);
1725     if (U_SUCCESS(status)) {
1726         runUnicodeTestData("SentenceBreakTest.txt", bi);
1727     }
1728     delete bi;
1729
1730     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1731     TEST_ASSERT_SUCCESS(status);
1732     if (U_SUCCESS(status)) {
1733         runUnicodeTestData("LineBreakTest.txt", bi);
1734     }
1735     delete bi;
1736 }
1737
1738
1739 //--------------------------------------------------------------------------------------------
1740 //
1741 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1742 //
1743 //-------------------------------------------------------------------------------------------
1744 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1745 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1746     // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb. Ticket #7270
1747     UBool isTicket7270Fixed = isICUVersionAtLeast(50, 0);
1748     UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
1749     UErrorCode  status = U_ZERO_ERROR;
1750
1751     //
1752     //  Open and read the test data file, put it into a UnicodeString.
1753     //
1754     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1755     char testFileName[1000];
1756     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1757         dataerrln("Can't open test data.  Path too long.");
1758         return;
1759     }
1760     strcpy(testFileName, testDataDirectory);
1761     strcat(testFileName, fileName);
1762
1763     logln("Opening data file %s\n", fileName);
1764
1765     int    len;
1766     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1767     if (status != U_FILE_ACCESS_ERROR) {
1768         TEST_ASSERT_SUCCESS(status);
1769         TEST_ASSERT(testFile != NULL);
1770     }
1771     if (U_FAILURE(status) || testFile == NULL) {
1772         return; /* something went wrong, error already output */
1773     }
1774     UnicodeString testFileAsString(TRUE, testFile, len);
1775
1776     //
1777     //  Parse the test data file using a regular expression.
1778     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1779     //     is identified by which group had a match.
1780     //
1781     //    Caputure Group #                  1          2            3            4           5
1782     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1783     //
1784     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1785     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1786     UnicodeString   testString;
1787     UVector32       breakPositions(status);
1788     int             lineNumber = 1;
1789     TEST_ASSERT_SUCCESS(status);
1790     if (U_FAILURE(status)) {
1791         return;
1792     }
1793
1794     //
1795     //  Scan through each test case, building up the string to be broken in testString,
1796     //   and the positions that should be boundaries in the breakPositions vector.
1797     //
1798     int spin = 0;
1799     while (tokenMatcher.find()) {
1800         if(tokenMatcher.hitEnd()) {
1801           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1802              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1803              and caused an infinite loop here on EBCDIC systems!
1804           */
1805           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1806           //       return;
1807         }
1808         if (tokenMatcher.start(1, status) >= 0) {
1809             // Scanned a divide sign, indicating a break position in the test data.
1810             if (testString.length()>0) {
1811                 breakPositions.addElement(testString.length(), status);
1812             }
1813         }
1814         else if (tokenMatcher.start(2, status) >= 0) {
1815             // Scanned an 'x', meaning no break at this position in the test data
1816             //   Nothing to be done here.
1817             }
1818         else if (tokenMatcher.start(3, status) >= 0) {
1819             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1820             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1821             int length = hexNumber.length();
1822             if (length<=8) {
1823                 char buf[10];
1824                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1825                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1826                 if (c<=0x10ffff) {
1827                     testString.append(c);
1828                 } else {
1829                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1830                        fileName, lineNumber);
1831                 }
1832             } else {
1833                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1834                        fileName, lineNumber);
1835              }
1836         }
1837         else if (tokenMatcher.start(4, status) >= 0) {
1838             // Scanned to end of a line, possibly skipping over a comment in the process.
1839             //   If the line from the file contained test data, run the test now.
1840             //
1841             if (testString.length() > 0) {
1842 // TODO(andy): Remove this time bomb code. Note: Line range updated for Unicode 6.1 LineBreakTest.txt.
1843 if (!isLineBreak || isTicket7270Fixed || !(5066 <= lineNumber && lineNumber <= 5170)) {
1844                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1845 }
1846             }
1847
1848             // Clear out this test case.
1849             //    The string and breakPositions vector will be refilled as the next
1850             //       test case is parsed.
1851             testString.remove();
1852             breakPositions.removeAllElements();
1853             lineNumber++;
1854         } else {
1855             // Scanner catchall.  Something unrecognized appeared on the line.
1856             char token[16];
1857             UnicodeString uToken = tokenMatcher.group(0, status);
1858             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1859             token[sizeof(token)-1] = 0;
1860             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1861
1862             // Clean up, in preparation for continuing with the next line.
1863             testString.remove();
1864             breakPositions.removeAllElements();
1865             lineNumber++;
1866         }
1867         TEST_ASSERT_SUCCESS(status);
1868         if (U_FAILURE(status)) {
1869             break;
1870         }
1871     }
1872
1873     delete [] testFile;
1874  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1875 }
1876
1877 //--------------------------------------------------------------------------------------------
1878 //
1879 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1880 //                            test data files.  Do only a simple, forward-only check -
1881 //                            this test is mostly to check that ICU and the Unicode
1882 //                            data agree with each other.
1883 //
1884 //--------------------------------------------------------------------------------------------
1885 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1886                          const UnicodeString &testString,   // Text data to be broken
1887                          UVector32 *breakPositions,         // Positions where breaks should be found.
1888                          RuleBasedBreakIterator *bi) {
1889     int32_t pos;                 // Break Position in the test string
1890     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1891     int32_t expectedPos;         // Expected break position (index into test string)
1892
1893     bi->setText(testString);
1894     pos = bi->first();
1895     pos = bi->next();
1896
1897     while (pos != BreakIterator::DONE) {
1898         if (expectedI >= breakPositions->size()) {
1899             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1900                 testFileName, lineNumber, pos);
1901             break;
1902         }
1903         expectedPos = breakPositions->elementAti(expectedI);
1904         if (pos < expectedPos) {
1905             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1906                 testFileName, lineNumber, pos);
1907             break;
1908         }
1909         if (pos > expectedPos) {
1910             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1911                 testFileName, lineNumber, expectedPos);
1912             break;
1913         }
1914         pos = bi->next();
1915         expectedI++;
1916     }
1917
1918     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1919         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1920             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1921     }
1922 }
1923
1924
1925
1926 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1927 //---------------------------------------------------------------------------------------
1928 //
1929 //   classs RBBIMonkeyKind
1930 //
1931 //      Monkey Test for Break Iteration
1932 //      Abstract interface class.   Concrete derived classes independently
1933 //      implement the break rules for different iterator types.
1934 //
1935 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1936 //      testing, but works purely in terms of the interface defined here.
1937 //
1938 //---------------------------------------------------------------------------------------
1939 class RBBIMonkeyKind {
1940 public:
1941     // Return a UVector of UnicodeSets, representing the character classes used
1942     //   for this type of iterator.
1943     virtual  UVector  *charClasses() = 0;
1944
1945     // Set the test text on which subsequent calls to next() will operate
1946     virtual  void      setText(const UnicodeString &s) = 0;
1947
1948     // Find the next break postion, starting from the prev break position, or from zero.
1949     // Return -1 after reaching end of string.
1950     virtual  int32_t   next(int32_t i) = 0;
1951
1952     virtual ~RBBIMonkeyKind();
1953     UErrorCode       deferredStatus;
1954
1955
1956 protected:
1957     RBBIMonkeyKind();
1958
1959 private:
1960 };
1961
1962 RBBIMonkeyKind::RBBIMonkeyKind() {
1963     deferredStatus = U_ZERO_ERROR;
1964 }
1965
1966 RBBIMonkeyKind::~RBBIMonkeyKind() {
1967 }
1968
1969
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//   m_seed   the generator state.  Assign to it to reseed; read it to record the
//            state needed to replay a sequence.
//   m_rand   advances the state by one linear congruential step and returns a
//            value in the range 0..32767, taken from bits 16-30 of the new state.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

static uint32_t m_rand()
{
    const uint32_t kMultiplier = 1103515245;
    const uint32_t kIncrement  = 12345;

    // Unsigned arithmetic: the step wraps modulo 2^32.
    m_seed = m_seed * kMultiplier + kIncrement;

    // Discard the low 16 bits and keep the next 15.
    return (m_seed >> 16) & 0x7fff;
}
1985
1986
1987 //------------------------------------------------------------------------------------------
1988 //
1989 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1990 //                             of RBBIMonkeyKind.
1991 //
1992 //------------------------------------------------------------------------------------------
1993 class RBBICharMonkey: public RBBIMonkeyKind {
1994 public:
1995     RBBICharMonkey();
1996     virtual          ~RBBICharMonkey();
1997     virtual  UVector *charClasses();
1998     virtual  void     setText(const UnicodeString &s);
1999     virtual  int32_t  next(int32_t i);
2000 private:
2001     UVector   *fSets;
2002
2003     UnicodeSet  *fCRLFSet;
2004     UnicodeSet  *fControlSet;
2005     UnicodeSet  *fExtendSet;
2006     UnicodeSet  *fPrependSet;
2007     UnicodeSet  *fSpacingSet;
2008     UnicodeSet  *fLSet;
2009     UnicodeSet  *fVSet;
2010     UnicodeSet  *fTSet;
2011     UnicodeSet  *fLVSet;
2012     UnicodeSet  *fLVTSet;
2013     UnicodeSet  *fHangulSet;
2014     UnicodeSet  *fAnySet;
2015
2016     const UnicodeString *fText;
2017 };
2018
2019
2020 RBBICharMonkey::RBBICharMonkey() {
2021     UErrorCode  status = U_ZERO_ERROR;
2022
2023     fText = NULL;
2024
2025     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2026     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2027     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2028     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2029     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2030     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2031     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2032     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2033     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2034     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2035     fHangulSet  = new UnicodeSet();
2036     fHangulSet->addAll(*fLSet);
2037     fHangulSet->addAll(*fVSet);
2038     fHangulSet->addAll(*fTSet);
2039     fHangulSet->addAll(*fLVSet);
2040     fHangulSet->addAll(*fLVTSet);
2041     fAnySet     = new UnicodeSet(0, 0x10ffff);
2042
2043     fSets       = new UVector(status);
2044     fSets->addElement(fCRLFSet,    status);
2045     fSets->addElement(fControlSet, status);
2046     fSets->addElement(fExtendSet,  status);
2047     if (!fPrependSet->isEmpty()) {
2048         fSets->addElement(fPrependSet, status);
2049     }
2050     fSets->addElement(fSpacingSet, status);
2051     fSets->addElement(fHangulSet,  status);
2052     fSets->addElement(fAnySet,     status);
2053     if (U_FAILURE(status)) {
2054         deferredStatus = status;
2055     }
2056 }
2057
2058
2059 void RBBICharMonkey::setText(const UnicodeString &s) {
2060     fText = &s;
2061 }
2062
2063
2064
2065 int32_t RBBICharMonkey::next(int32_t prevPos) {
2066     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2067                               //   break position being tested.  The candidate break
2068                               //   location is before p2.
2069
2070     int     breakPos = -1;
2071
2072     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2073
2074     if (U_FAILURE(deferredStatus)) {
2075         return -1;
2076     }
2077
2078     // Previous break at end of string.  return DONE.
2079     if (prevPos >= fText->length()) {
2080         return -1;
2081     }
2082     p0 = p1 = p2 = p3 = prevPos;
2083     c3 =  fText->char32At(prevPos);
2084     c0 = c1 = c2 = 0;
2085
2086     // Loop runs once per "significant" character position in the input text.
2087     for (;;) {
2088         // Move all of the positions forward in the input string.
2089         p0 = p1;  c0 = c1;
2090         p1 = p2;  c1 = c2;
2091         p2 = p3;  c2 = c3;
2092
2093         // Advancd p3 by one codepoint
2094         p3 = fText->moveIndex32(p3, 1);
2095         c3 = fText->char32At(p3);
2096
2097         if (p1 == p2) {
2098             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2099             continue;
2100         }
2101         if (p2 == fText->length()) {
2102             // Reached end of string.  Always a break position.
2103             break;
2104         }
2105
2106         // Rule  GB3   CR x LF
2107         //     No Extend or Format characters may appear between the CR and LF,
2108         //     which requires the additional check for p2 immediately following p1.
2109         //
2110         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2111             continue;
2112         }
2113
2114         // Rule (GB4).   ( Control | CR | LF ) <break>
2115         if (fControlSet->contains(c1) ||
2116             c1 == 0x0D ||
2117             c1 == 0x0A)  {
2118             break;
2119         }
2120
2121         // Rule (GB5)    <break>  ( Control | CR | LF )
2122         //
2123         if (fControlSet->contains(c2) ||
2124             c2 == 0x0D ||
2125             c2 == 0x0A)  {
2126             break;
2127         }
2128
2129
2130         // Rule (GB6)  L x ( L | V | LV | LVT )
2131         if (fLSet->contains(c1) &&
2132                (fLSet->contains(c2)  ||
2133                 fVSet->contains(c2)  ||
2134                 fLVSet->contains(c2) ||
2135                 fLVTSet->contains(c2))) {
2136             continue;
2137         }
2138
2139         // Rule (GB7)    ( LV | V )  x  ( V | T )
2140         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2141             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2142             continue;
2143         }
2144
2145         // Rule (GB8)    ( LVT | T)  x T
2146         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2147             fTSet->contains(c2))  {
2148             continue;
2149         }
2150
2151         // Rule (GB9)    Numeric x ALetter
2152         if (fExtendSet->contains(c2))  {
2153             continue;
2154         }
2155
2156         // Rule (GB9a)   x  SpacingMark
2157         if (fSpacingSet->contains(c2)) {
2158             continue;
2159         }
2160
2161         // Rule (GB9b)   Prepend x
2162         if (fPrependSet->contains(c1)) {
2163             continue;
2164         }
2165
2166         // Rule (GB10)  Any  <break>  Any
2167         break;
2168     }
2169
2170     breakPos = p2;
2171     return breakPos;
2172 }
2173
2174
2175
2176 UVector  *RBBICharMonkey::charClasses() {
2177     return fSets;
2178 }
2179
2180
2181 RBBICharMonkey::~RBBICharMonkey() {
2182     delete fSets;
2183     delete fCRLFSet;
2184     delete fControlSet;
2185     delete fExtendSet;
2186     delete fPrependSet;
2187     delete fSpacingSet;
2188     delete fLSet;
2189     delete fVSet;
2190     delete fTSet;
2191     delete fLVSet;
2192     delete fLVTSet;
2193     delete fHangulSet;
2194     delete fAnySet;
2195 }
2196
2197 //------------------------------------------------------------------------------------------
2198 //
2199 //   class RBBIWordMonkey      Word Break specific implementation
2200 //                             of RBBIMonkeyKind.
2201 //
2202 //------------------------------------------------------------------------------------------
2203 class RBBIWordMonkey: public RBBIMonkeyKind {
2204 public:
2205     RBBIWordMonkey();
2206     virtual          ~RBBIWordMonkey();
2207     virtual  UVector *charClasses();
2208     virtual  void     setText(const UnicodeString &s);
2209     virtual int32_t   next(int32_t i);
2210 private:
2211     UVector      *fSets;
2212
2213     UnicodeSet  *fCRSet;
2214     UnicodeSet  *fLFSet;
2215     UnicodeSet  *fNewlineSet;
2216     UnicodeSet  *fKatakanaSet;
2217     UnicodeSet  *fALetterSet;
2218     UnicodeSet  *fMidNumLetSet;
2219     UnicodeSet  *fMidLetterSet;
2220     UnicodeSet  *fMidNumSet;
2221     UnicodeSet  *fNumericSet;
2222     UnicodeSet  *fFormatSet;
2223     UnicodeSet  *fOtherSet;
2224     UnicodeSet  *fExtendSet;
2225     UnicodeSet  *fExtendNumLetSet;
2226
2227     RegexMatcher  *fMatcher;
2228
2229     const UnicodeString  *fText;
2230 };
2231
2232
2233 RBBIWordMonkey::RBBIWordMonkey()
2234 {
2235     UErrorCode  status = U_ZERO_ERROR;
2236
2237     fSets            = new UVector(status);
2238
2239     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2240     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2241     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2242     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
2243     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2244     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2245     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2246     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2247     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2248     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2249     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2250     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2251
2252     fOtherSet        = new UnicodeSet();
2253     if(U_FAILURE(status)) {
2254       deferredStatus = status;
2255       return;
2256     }
2257
2258     fOtherSet->complement();
2259     fOtherSet->removeAll(*fCRSet);
2260     fOtherSet->removeAll(*fLFSet);
2261     fOtherSet->removeAll(*fNewlineSet);
2262     fOtherSet->removeAll(*fKatakanaSet);
2263     fOtherSet->removeAll(*fALetterSet);
2264     fOtherSet->removeAll(*fMidLetterSet);
2265     fOtherSet->removeAll(*fMidNumSet);
2266     fOtherSet->removeAll(*fNumericSet);
2267     fOtherSet->removeAll(*fExtendNumLetSet);
2268     fOtherSet->removeAll(*fFormatSet);
2269     fOtherSet->removeAll(*fExtendSet);
2270     // Inhibit dictionary characters from being tested at all.
2271     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2272
2273     fSets->addElement(fCRSet,        status);
2274     fSets->addElement(fLFSet,        status);
2275     fSets->addElement(fNewlineSet,   status);
2276     fSets->addElement(fALetterSet,   status);
2277     fSets->addElement(fKatakanaSet,  status);
2278     fSets->addElement(fMidLetterSet, status);
2279     fSets->addElement(fMidNumLetSet, status);
2280     fSets->addElement(fMidNumSet,    status);
2281     fSets->addElement(fNumericSet,   status);
2282     fSets->addElement(fFormatSet,    status);
2283     fSets->addElement(fExtendSet,    status);
2284     fSets->addElement(fOtherSet,     status);
2285     fSets->addElement(fExtendNumLetSet, status);
2286
2287     if (U_FAILURE(status)) {
2288         deferredStatus = status;
2289     }
2290 }
2291
2292 void RBBIWordMonkey::setText(const UnicodeString &s) {
2293     fText       = &s;
2294 }
2295
2296
2297 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2298     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2299                               //   break position being tested.  The candidate break
2300                               //   location is before p2.
2301
2302     int     breakPos = -1;
2303
2304     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2305
2306     if (U_FAILURE(deferredStatus)) {
2307         return -1;
2308     }
2309
2310     // Prev break at end of string.  return DONE.
2311     if (prevPos >= fText->length()) {
2312         return -1;
2313     }
2314     p0 = p1 = p2 = p3 = prevPos;
2315     c3 =  fText->char32At(prevPos);
2316     c0 = c1 = c2 = 0;
2317
2318     // Loop runs once per "significant" character position in the input text.
2319     for (;;) {
2320         // Move all of the positions forward in the input string.
2321         p0 = p1;  c0 = c1;
2322         p1 = p2;  c1 = c2;
2323         p2 = p3;  c2 = c3;
2324
2325         // Advancd p3 by    X(Extend | Format)*   Rule 4
2326         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2327         do {
2328             p3 = fText->moveIndex32(p3, 1);
2329             c3 = fText->char32At(p3);
2330             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2331                break;
2332             };
2333         }
2334         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2335
2336
2337         if (p1 == p2) {
2338             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2339             continue;
2340         }
2341         if (p2 == fText->length()) {
2342             // Reached end of string.  Always a break position.
2343             break;
2344         }
2345
2346         // Rule  (3)   CR x LF
2347         //     No Extend or Format characters may appear between the CR and LF,
2348         //     which requires the additional check for p2 immediately following p1.
2349         //
2350         if (c1==0x0D && c2==0x0A) {
2351             continue;
2352         }
2353
2354         // Rule (3a)  Break before and after newlines (including CR and LF)
2355         //
2356         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2357             break;
2358         };
2359         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2360             break;
2361         };
2362
2363         // Rule (5).   ALetter x ALetter
2364         if (fALetterSet->contains(c1) &&
2365             fALetterSet->contains(c2))  {
2366             continue;
2367         }
2368
2369         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2370         //
2371         if ( fALetterSet->contains(c1)   &&
2372              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2373              fALetterSet->contains(c3)) {
2374             continue;
2375         }
2376
2377
2378         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2379         if (fALetterSet->contains(c0) &&
2380             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2381             fALetterSet->contains(c2)) {
2382             continue;
2383         }
2384
2385         // Rule (8)    Numeric x Numeric
2386         if (fNumericSet->contains(c1) &&
2387             fNumericSet->contains(c2))  {
2388             continue;
2389         }
2390
2391         // Rule (9)    ALetter x Numeric
2392         if (fALetterSet->contains(c1) &&
2393             fNumericSet->contains(c2))  {
2394             continue;
2395         }
2396
2397         // Rule (10)    Numeric x ALetter
2398         if (fNumericSet->contains(c1) &&
2399             fALetterSet->contains(c2))  {
2400             continue;
2401         }
2402
2403         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2404         if (fNumericSet->contains(c0) &&
2405             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2406             fNumericSet->contains(c2)) {
2407             continue;
2408         }
2409
2410         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2411         if (fNumericSet->contains(c1) &&
2412             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2413             fNumericSet->contains(c3)) {
2414             continue;
2415         }
2416
2417         // Rule (13)  Katakana x Katakana
2418         if (fKatakanaSet->contains(c1) &&
2419             fKatakanaSet->contains(c2))  {
2420             continue;
2421         }
2422
2423         // Rule 13a
2424         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2425              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2426              fExtendNumLetSet->contains(c2)) {
2427                 continue;
2428              }
2429
2430         // Rule 13b
2431         if (fExtendNumLetSet->contains(c1) &&
2432                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2433                 fKatakanaSet->contains(c2)))  {
2434                 continue;
2435              }
2436
2437         // Rule 14.  Break found here.
2438         break;
2439     }
2440
2441     breakPos = p2;
2442     return breakPos;
2443 }
2444
2445
2446 UVector *RBBIWordMonkey::charClasses() {
2447     return this->fSets;   // Borrowed pointer: the vector is deleted by ~RBBIWordMonkey().
2448 }
2449
2450
2451 RBBIWordMonkey::~RBBIWordMonkey() {
2452     // Release the class list first, then every UnicodeSet member, each one
2453     // exactly once, going from fCRSet through fOtherSet.
2454     delete fSets;
2455
2456     UnicodeSet *const ownedSets[] = {
2457         fCRSet,        fLFSet,        fNewlineSet,   fKatakanaSet,
2458         fALetterSet,   fMidNumLetSet, fMidLetterSet, fMidNumSet,
2459         fNumericSet,   fFormatSet,    fExtendSet,    fExtendNumLetSet,
2460         fOtherSet
2461     };
2462     const int32_t ownedCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
2463     for (int32_t idx = 0; idx < ownedCount; ++idx) {
2464         delete ownedSets[idx];
2465     }
2466 }
2467
2468
2469
2470
2471 //------------------------------------------------------------------------------------------
2472 //
2473 //   class RBBISentMonkey      Sentence Break specific implementation
2474 //                             of RBBIMonkeyKind.
2475 //
2476 //------------------------------------------------------------------------------------------
2477 class RBBISentMonkey: public RBBIMonkeyKind {
2478 public:
2479     RBBISentMonkey();
2480     virtual ~RBBISentMonkey();
2481
2482     virtual UVector *charClasses();                    // The sets below, as one list.
2483     virtual void     setText(const UnicodeString &s);  // Text that next() will scan.
2484     virtual int32_t  next(int32_t i);                  // Next break after index i, or -1.
2485
2486 private:
2487     // moveBack() / moveForward() step over Extend and Format code points;
2488     // cAt() is a bounds-checked read of fText.
2489     int      moveBack(int posFrom);
2490     int      moveForward(int posFrom);
2491     UChar32  cAt(int pos);
2492
2493     UVector *fSets;
2494
2495     // One set per character class used by the sentence break rules in next().
2496     UnicodeSet *fSepSet,     *fFormatSet,    *fSpSet;
2497     UnicodeSet *fLowerSet,   *fUpperSet,     *fOLetterSet;
2498     UnicodeSet *fNumericSet, *fATermSet,     *fSContinueSet;
2499     UnicodeSet *fSTermSet,   *fCloseSet;
2500     UnicodeSet *fOtherSet;      // Everything that none of the other sets contains.
2501     UnicodeSet *fExtendSet;
2502
2503     // Text currently under test.  Not owned: setText() stores the address of
2504     // the caller's string.
2505     const UnicodeString *fText;
2506
2507 };
2508
2509 RBBISentMonkey::RBBISentMonkey()
2510 {
2511     // Builds the character class sets used by next().  Errors are not reported
2512     // directly: they are parked in deferredStatus, which next() tests before
2513     // doing any work.
2514     UErrorCode  ec = U_ZERO_ERROR;
2515
2516     fSets = new UVector(ec);
2517
2518     // One set per Sentence_Break property value.
2519     // Sep:  Unicode 5.1 turned CR and LF into classes of their own, but the rules
2520     //       always name Sep together with CR and LF, so both are folded back in here.
2521     fSepSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"), ec);
2522     fFormatSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    ec);
2523     fSpSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        ec);
2524     fLowerSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     ec);
2525     fUpperSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     ec);
2526     fOLetterSet   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   ec);
2527     fNumericSet   = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   ec);
2528     fATermSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     ec);
2529     fSContinueSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), ec);
2530     fSTermSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     ec);
2531     fCloseSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     ec);
2532     fExtendSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    ec);
2533     fOtherSet     = new UnicodeSet();
2534
2535     // Bail out if any of the constructors above reported a failure.
2536     if (U_FAILURE(ec)) {
2537         deferredStatus = ec;
2538         return;
2539     }
2540
2541     // Every class, in the order in which charClasses() will expose them.
2542     UnicodeSet *const classes[] = {
2543         fSepSet,       fFormatSet,  fSpSet,      fLowerSet,
2544         fUpperSet,     fOLetterSet, fNumericSet, fATermSet,
2545         fSContinueSet, fSTermSet,   fCloseSet,
2546         fOtherSet,     fExtendSet
2547     };
2548     const int32_t classCount = (int32_t)(sizeof(classes) / sizeof(classes[0]));
2549     int32_t       k;
2550
2551     // Other = every code point that none of the explicit classes claims.
2552     fOtherSet->complement();
2553     for (k = 0; k < classCount; ++k) {
2554         if (classes[k] != fOtherSet) {
2555             fOtherSet->removeAll(*classes[k]);
2556         }
2557     }
2558
2559     // Register the classes with the list returned by charClasses().
2560     for (k = 0; k < classCount; ++k) {
2561         fSets->addElement(classes[k], ec);
2562     }
2563
2564     // A failure while filling the list is deferred in the same way.
2565     if (U_FAILURE(ec)) {
2566         deferredStatus = ec;
2567     }
2568 }
2569
2570
2571
2572 void RBBISentMonkey::setText(const UnicodeString &s) {
2573     this->fText = &s;   // Only the address is kept, so s must stay alive while next() is used.
2574 }
2575
2576 UVector *RBBISentMonkey::charClasses() {
2577     return this->fSets;   // Borrowed pointer: the vector is deleted by ~RBBISentMonkey().
2578 }
2579
2580
2581 //  moveBack()   Find the "significant" code point preceding the index i.
2582 //               Skips over ($Extend | $Format)* .
2583 //
2584 int RBBISentMonkey::moveBack(int i) {
2585     if (i <= 0) {            // Nothing precedes the start of the text.
2586         return -1;
2587     }
2588     int32_t  idx = i;
2589     for (;;) {
2590         idx = fText->moveIndex32(idx, -1);
2591         const UChar32 cp = fText->char32At(idx);
2592         // Stop at index 0, or at the first code point that is neither Format nor Extend.
2593         if (idx <= 0 || !(fFormatSet->contains(cp) || fExtendSet->contains(cp))) {
2594             return idx;
2595         }
2596     }
2597 }
2598
2599
2600 int RBBISentMonkey::moveForward(int i) {
2601     if (i >= fText->length()) {
2602         return fText->length();
2603     }
2604     // Step over one code point, then over every Format / Extend that trails it.
2605     int32_t  idx = fText->moveIndex32(i, 1);
2606     UChar32  cp  = cAt(idx);
2607     while (fFormatSet->contains(cp) || fExtendSet->contains(cp)) {
2608         idx = fText->moveIndex32(idx, 1);
2609         cp  = cAt(idx);
2610     }
2611     return idx;
2612 }
2613
2614 UChar32 RBBISentMonkey::cAt(int pos) {
2615     // Bounds-checked read of fText: any position outside [0, length) yields -1.
2616     // Negative positions are expected, since moveBack() returns -1 at the start.
2617     const UBool   inRange    = (pos >= 0) && (pos < fText->length());
2618     const UChar32 outOfRange = -1;
2619     return inRange ? fText->char32At(pos) : outOfRange;
2620 }
2621
2622 int32_t RBBISentMonkey::next(int32_t prevPos) {   // Returns the index of the first sentence break after prevPos, or -1 if there is none.
2623     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2624                               //   break position being tested.  The candidate break
2625                               //   location is before p2.
2626     // Each loop pass shifts this window one significant code point: p0 <- p1 <- p2 <- p3.
2627     int     breakPos = -1;    // Overwritten with p2 once the loop exits.
2628
2629     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2630     UChar32 c;                // Scratch code point for the look-behind / look-ahead scans.
2631
2632     if (U_FAILURE(deferredStatus)) {    // Set by the constructor when building the sets failed.
2633         return -1;
2634     }
2635
2636     // Previous break was at (or beyond) the end of the string: nothing left, return -1.
2637     if (prevPos >= fText->length()) {
2638         return -1;
2639     }
2640     p0 = p1 = p2 = p3 = prevPos;
2641     c3 =  fText->char32At(prevPos);
2642     c0 = c1 = c2 = 0;         // Placeholders until the loop has shifted real code points in.
2643
2644     // Loop runs once per "significant" character position in the input text.
2645     for (;;) {
2646         // Move all of the positions forward in the input string.
2647         p0 = p1;  c0 = c1;
2648         p1 = p2;  c1 = c2;
2649         p2 = p3;  c2 = c3;
2650
2651         // Advance p3 by    X(Extend | Format)*   Rule 4
2652         p3 = moveForward(p3);
2653         c3 = cAt(p3);         // -1 once p3 has run off the end of the text.
2654
2655         // Rule (3)  CR x LF
2656         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {    // p2==p1+1: nothing was skipped between the CR and the LF.
2657             continue;
2658         }
2659
2660         // Rule (4).   Sep  <break>
2661         if (fSepSet->contains(c1)) {
2662             p2 = p1+1;   // Separators don't combine with Extend or Format.
2663             break;
2664         }
2665
2666         if (p2 >= fText->length()) {
2667             // Reached end of string.  Always a break position.
2668             break;
2669         }
2670
2671         if (p2 == prevPos) {
2672             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2673             continue;
2674         }
2675
2676         // Rule (6).   ATerm x Numeric
2677         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2678             continue;
2679         }
2680
2681         // Rule (7).  Upper ATerm  x  Upper
2682         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2683             continue;
2684         }
2685
2686         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2687         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2688         //                  note to the Unicode 5.0 documents.
2689         int p8 = p1;          // Look behind from p1: skip Sp*, then Close*, expecting an ATerm.
2690         while (fSpSet->contains(cAt(p8))) {
2691             p8 = moveBack(p8);
2692         }
2693         while (fCloseSet->contains(cAt(p8))) {
2694             p8 = moveBack(p8);
2695         }
2696         if (fATermSet->contains(cAt(p8))) {
2697             p8=p2;            // Look ahead from p2 for the first letter, Sep, ATerm, STerm or end of text (-1).
2698             for (;;) {
2699                 c = cAt(p8);
2700                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2701                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2702                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2703                     break;
2704                 }
2705                 p8 = moveForward(p8);
2706             }
2707             if (fLowerSet->contains(cAt(p8))) {    // The look-ahead stopped on a Lower: no break here.
2708                 continue;
2709             }
2710         }
2711
2712         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2713         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2714             p8 = p1;          // Same look-behind as rule 8, but an STerm is accepted as well.
2715             while (fSpSet->contains(cAt(p8))) {
2716                 p8 = moveBack(p8);
2717             }
2718             while (fCloseSet->contains(cAt(p8))) {
2719                 p8 = moveBack(p8);
2720             }
2721             c = cAt(p8);
2722             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2723                 continue;
2724             }
2725         }
2726
2727         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
2728         int p9 = p1;          // Look behind over Close* only.
2729         while (fCloseSet->contains(cAt(p9))) {
2730             p9 = moveBack(p9);
2731         }
2732         c = cAt(p9);
2733         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2734             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {    // fSepSet also holds CR and LF.
2735                 continue;
2736             }
2737         }
2738
2739         // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2740         int p10 = p1;         // Look behind over Sp*, then Close*.
2741         while (fSpSet->contains(cAt(p10))) {
2742             p10 = moveBack(p10);
2743         }
2744         while (fCloseSet->contains(cAt(p10))) {
2745             p10 = moveBack(p10);
2746         }
2747         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2748             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {    // fSepSet also holds CR and LF.
2749                 continue;
2750             }
2751         }
2752
2753         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2754         int p11 = p1;
2755         if (fSepSet->contains(cAt(p11))) {    // The optional (Sep | CR | LF): step back over at most one.
2756             p11 = moveBack(p11);
2757         }
2758         while (fSpSet->contains(cAt(p11))) {
2759             p11 = moveBack(p11);
2760         }
2761         while (fCloseSet->contains(cAt(p11))) {
2762             p11 = moveBack(p11);
2763         }
2764         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2765             break;            // A sentence terminator lies behind the candidate position: break before p2.
2766         }
2767
2768         //  Rule (12)  Any x Any
2769         continue;
2770     }
2771     breakPos = p2;            // Only reached through a break statement in the loop above.
2772     return breakPos;
2773 }
2774
2775 RBBISentMonkey::~RBBISentMonkey() {
2776     // Release the class list first, then every UnicodeSet member, each one
2777     // exactly once, going from fSepSet through fExtendSet.
2778     delete fSets;
2779
2780     UnicodeSet *const ownedSets[] = {
2781         fSepSet,       fFormatSet,  fSpSet,      fLowerSet,
2782         fUpperSet,     fOLetterSet, fNumericSet, fATermSet,
2783         fSContinueSet, fSTermSet,   fCloseSet,   fOtherSet,
2784         fExtendSet
2785     };
2786     const int32_t ownedCount = (int32_t)(sizeof(ownedSets) / sizeof(ownedSets[0]));
2787     for (int32_t idx = 0; idx < ownedCount; ++idx) {
2788         delete ownedSets[idx];
2789     }
2790 }
2791
2792
2793
2794 //-------------------------------------------------------------------------------------------
2795 //
2796 //  RBBILineMonkey
2797 //
2798 //-------------------------------------------------------------------------------------------
2799
2800 class RBBILineMonkey: public RBBIMonkeyKind {
2801 public:
2802     RBBILineMonkey();
2803     virtual ~RBBILineMonkey();
2804
2805     virtual UVector *charClasses();
2806     virtual void     setText(const UnicodeString &s);
2807     virtual int32_t  next(int32_t i);
2808
2809     // Line break rules LB 9 and LB 10 (combining mark handling), applied around
2810     // index pos.  posChar, nextPos and nextChar are read and updated in place.
2811     virtual void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2812
2813 private:
2814     UVector     *fSets;
2815
2816     // One set per Line_Break property value; each member carries the name of
2817     // the two-letter class code it is built from in the constructor.
2818     UnicodeSet  *fBK, *fCR, *fLF, *fCM, *fNL;
2819     UnicodeSet  *fSG;      // The exception: built from the surrogate range, not from a property.
2820     UnicodeSet  *fWJ, *fZW, *fGL, *fCB, *fSP;
2821     UnicodeSet  *fB2, *fBA, *fBB, *fHY;
2822     UnicodeSet  *fH2, *fH3;
2823     UnicodeSet  *fCL, *fCP, *fEX, *fIN;
2824     UnicodeSet  *fJL, *fJV, *fJT;
2825     UnicodeSet  *fNS, *fOP, *fQU, *fIS, *fNU;
2826     UnicodeSet  *fPO, *fPR, *fSY;
2827     UnicodeSet  *fAI, *fAL, *fCJ, *fHL, *fID, *fSA, *fXX;
2828
2829     BreakIterator        *fCharBI;
2830
2831     // Text under test.  Not owned: setText() stores the address of its argument.
2832     const UnicodeString  *fText;
2833     int32_t              *fOrigPositions;   // NOTE(review): no use visible near this class - confirm.
2834
2835     RegexMatcher         *fNumberMatcher;
2836     RegexMatcher         *fLB11Matcher;     // NOTE(review): no use visible near this class - confirm.
2837 };
2859
2860
2861 RBBILineMonkey::RBBILineMonkey()
2862     : fCharBI(NULL), fText(NULL), fOrigPositions(NULL), fNumberMatcher(NULL), fLB11Matcher(NULL)
2863 {   // Pointers that the body does not assign on every path start out NULL instead of indeterminate.
2864     UErrorCode  status = U_ZERO_ERROR;
2865     fSets  = new UVector(status);
2866     // One UnicodeSet per Line_Break property value, plus fSG for the surrogate range.
2867     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2868     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2869     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2870     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2871     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2872     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2873     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2874     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2875     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2876     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2877     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2878     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2879     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2880     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2881     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2882     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2883     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2884     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2885     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2886     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2887     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2888     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2889     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2890     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2891     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2892     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2893     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2894     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2895     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2896     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2897     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2898     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2899     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2900     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2901     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2902     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2903     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2904     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2905     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2906
2907     if (U_FAILURE(status)) {
2908         // Park the error for next() to see.  fCharBI and fNumberMatcher keep
2909         // the NULL they were given in the initializer list.
2910         deferredStatus = status;
2911         return;
2912     }
2913
2914     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2915     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2916     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2917     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2918
2919     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2920     // Classes handed out by charClasses().  fXX and fCJ are not listed on their own; they were merged above.
2921     fSets->addElement(fBK, status);
2922     fSets->addElement(fCR, status);
2923     fSets->addElement(fLF, status);
2924     fSets->addElement(fCM, status);
2925     fSets->addElement(fNL, status);
2926     fSets->addElement(fWJ, status);
2927     fSets->addElement(fZW, status);
2928     fSets->addElement(fGL, status);
2929     fSets->addElement(fCB, status);
2930     fSets->addElement(fSP, status);
2931     fSets->addElement(fB2, status);
2932     fSets->addElement(fBA, status);
2933     fSets->addElement(fBB, status);
2934     fSets->addElement(fHY, status);
2935     fSets->addElement(fH2, status);
2936     fSets->addElement(fH3, status);
2937     fSets->addElement(fCL, status);
2938     fSets->addElement(fCP, status);
2939     fSets->addElement(fEX, status);
2940     fSets->addElement(fIN, status);
2941     fSets->addElement(fJL, status);
2942     fSets->addElement(fJT, status);
2943     fSets->addElement(fJV, status);
2944     fSets->addElement(fNS, status);
2945     fSets->addElement(fOP, status);
2946     fSets->addElement(fQU, status);
2947     fSets->addElement(fIS, status);
2948     fSets->addElement(fNU, status);
2949     fSets->addElement(fPO, status);
2950     fSets->addElement(fPR, status);
2951     fSets->addElement(fSY, status);
2952     fSets->addElement(fAI, status);
2953     fSets->addElement(fAL, status);
2954     fSets->addElement(fHL, status);
2955     fSets->addElement(fID, status);
2956     fSets->addElement(fWJ, status);   // NOTE(review): second entry for fWJ (see above); kept so the list is unchanged - confirm intent.
2957     fSets->addElement(fSA, status);
2958     fSets->addElement(fSG, status);
2959     // Number pattern: (PR|PO)? (OP|HY)? NU (NU|IS|SY)* (CL|CP)? (PR|PO)?, each element followed by CM*.
2960     const char *rules =
2961             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
2962             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
2963             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
2964             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
2965             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
2966             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
2967
2968     fNumberMatcher = new RegexMatcher(
2969         UnicodeString(rules, -1, US_INV), 0, status);
2970
2971     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2972
2973     if (U_FAILURE(status)) {
2974         deferredStatus = status;
2975     }
2976 }
2977
2978
2979 void RBBILineMonkey::setText(const UnicodeString &s) {   // Points this monkey and its helper objects at the text s.
2980     fText = &s;   // Only the address is kept, so s must stay alive while next() is used.
2981     if (fCharBI != NULL)        { fCharBI->setText(s); }        // Both helpers are NULL when the constructor
2982     if (fNumberMatcher != NULL) { fNumberMatcher->reset(s); }   //   failed early (deferredStatus is set then).
2983 }
2984
2985 //
2986 //  rule9Adjust
2987 //     Line Break TR rules 9 and 10 implementation.
2988 //     This deals with combining marks and other sequences that
2989 //     that must be treated as if they were something other than what they actually are.
2990 //
2991 //     This is factored out into a separate function because it must be applied twice for
2992 //     each potential break, once to the chars before the position being checked, then
2993 //     again to the text following the possible break.
2994 //
2995 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
2996     // pos == -1 marks the warm-up pass of the main loop in next(): there is
2997     // no valid character at pos yet, so nothing is adjusted.
2998     if (pos == -1) {
2999         return;
3000     }
3001
3002     int32_t  scanPos = *nextPos;
3003
3004     // LB 9: keep a combining sequence together, unless the character at pos is
3005     //       SP, BK, CR, LF, NL or ZW.  Note that Line Break CM is a different
3006     //       set from the normal Grapheme Extend property.
3007     const UChar32 base = *posChar;
3008     const UBool   cannotCombine =
3009             fSP->contains(base) || fBK->contains(base) || base == 0x0d ||
3010             base == 0x0a        || fNL->contains(base) || fZW->contains(base);
3011     if (!cannotCombine) {
3012         // Walk scanPos forward to the first code point that is not CM.
3013         for (*nextChar = fText->char32At(scanPos);
3014              fCM->contains(*nextChar);
3015              *nextChar = fText->char32At(scanPos)) {
3016             scanPos = fText->moveIndex32(scanPos, 1);
3017         }
3018     }
3019
3020     // LB 9, "treat X CM* as if it were X": needs no code of its own.
3021
3022     // LB 10: any combining mark that is still left is treated as AL;
3023     //        'A' (0x41) stands in for it.
3024     if (fCM->contains(*posChar)) {
3025         *posChar = 0x41;
3026     }
3027
3028     // Hand the position and character after the sequence back to the caller.
3029     // They differ from the values passed in only if CM characters were skipped.
3030     *nextPos  = scanPos;
3031     *nextChar = fText->char32At(scanPos);
3032 }
3033
3034
3035
3036 int32_t RBBILineMonkey::next(int32_t startPos) {
3037     UErrorCode status = U_ZERO_ERROR;
3038     int32_t    pos;       //  Index of the char following a potential break position
3039     UChar32    thisChar;  //  Character at above position "pos"
3040
3041     int32_t    prevPos;   //  Index of the char preceding a potential break position
3042     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3043                           //   and thisChar may not be adjacent because combining
3044                           //   characters between them will be ignored.
3045
3046     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3047     UChar32    prevCharX2;
3048
3049     int32_t    nextPos;   //  Index of the next character following pos.
3050                           //     Usually skips over combining marks.
3051     int32_t    nextCPPos; //  Index of the code point following "pos."
3052                           //     May point to a combining mark.
3053     int32_t    tPos;      //  temp value.
3054     UChar32    c;
3055
3056     if (U_FAILURE(deferredStatus)) {
3057         return -1;
3058     }
3059
3060     if (startPos >= fText->length()) {
3061         return -1;
3062     }
3063
3064
3065     // Initial values for loop.  Loop will run the first time without finding breaks,
3066     //                           while the invalid values shift out and the "this" and
3067     //                           "prev" positions are filled in with good values.
3068     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3069     thisChar = prevChar  = prevCharX2 = 0;
3070     nextPos  = nextCPPos = startPos;
3071
3072
3073     // Loop runs once per position in the test text, until a break position
3074     //  is found.
3075     for (;;) {
3076         prevPosX2 = prevPos;
3077         prevCharX2 = prevChar;
3078
3079         prevPos   = pos;
3080         prevChar  = thisChar;
3081
3082         pos       = nextPos;
3083         thisChar  = fText->char32At(pos);
3084
3085         nextCPPos = fText->moveIndex32(pos, 1);
3086         nextPos   = nextCPPos;
3087
3088         // Rule LB2 - Break at end of text.
3089         if (pos >= fText->length()) {
3090             break;
3091         }
3092
3093         // Rule LB 9 - adjust for combining sequences.
3094         //             We do this one out-of-order because the adjustment does not change anything
3095         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3096         //             be applied.
3097         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3098         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3099         c = fText->char32At(nextPos);
3100         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3101
3102         // If the loop is still warming up - if we haven't shifted the initial
3103         //   -1 positions out of prevPos yet - loop back to advance the
3104         //    position in the input without any further looking for breaks.
3105         if (prevPos == -1) {
3106             continue;
3107         }
3108
3109         // LB 4  Always break after hard line breaks,
3110         if (fBK->contains(prevChar)) {
3111             break;
3112         }
3113
3114         // LB 5  Break after CR, LF, NL, but not inside CR LF
3115         if (prevChar == 0x0d && thisChar == 0x0a) {
3116             continue;
3117         }
3118         if (prevChar == 0x0d ||
3119             prevChar == 0x0a ||
3120             prevChar == 0x85)  {
3121             break;
3122         }
3123
3124         // LB 6  Don't break before hard line breaks
3125         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3126             fBK->contains(thisChar)) {
3127                 continue;
3128         }
3129
3130
3131         // LB 7  Don't break before spaces or zero-width space.
3132         if (fSP->contains(thisChar)) {
3133             continue;
3134         }
3135
3136         if (fZW->contains(thisChar)) {
3137             continue;
3138         }
3139
3140         // LB 8  Break after zero width space
3141         if (fZW->contains(prevChar)) {
3142             break;
3143         }
3144
3145         // LB 9, 10  Already done, at top of loop.
3146         //
3147
3148
3149         // LB 11  Do not break before or after WORD JOINER and related characters.
3150         //    x  WJ
3151         //    WJ  x
3152         //
3153         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3154             continue;
3155         }
3156
3157         // LB 12
3158         //    GL  x
3159         if (fGL->contains(prevChar)) {
3160             continue;
3161         }
3162
3163         // LB 12a
3164         //    [^SP BA HY] x GL
3165         if (!(fSP->contains(prevChar) ||
3166               fBA->contains(prevChar) ||
3167               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3168             continue;
3169         }
3170
3171
3172
3173         // LB 13  Don't break before closings.
3174         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3175         //        fall into LB 17 and the more general number regular expression.
3176         //
3177         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3178             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3179                                          fEX->contains(thisChar)  ||
3180             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3181             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3182             continue;
3183         }
3184
3185         // LB 14 Don't break after OP SP*
3186         //       Scan backwards, checking for this sequence.
3187         //       The OP char could include combining marks, so we actually check for
3188         //           OP CM* SP*
3189         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3190         //       sequence into a ID char, so before scanning back through spaces,
3191         //       verify that prevChar is indeed a space.  The prevChar variable
3192         //       may differ from fText[prevPos]
3193         tPos = prevPos;
3194         if (fSP->contains(prevChar)) {
3195             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3196                 tPos=fText->moveIndex32(tPos, -1);
3197             }
3198         }
3199         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3200             tPos=fText->moveIndex32(tPos, -1);
3201         }
3202         if (fOP->contains(fText->char32At(tPos))) {
3203             continue;
3204         }
3205
3206
3207         // LB 15    QU SP* x OP
3208         if (fOP->contains(thisChar)) {
3209             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
3210             int tPos = prevPos;
3211             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3212                 tPos = fText->moveIndex32(tPos, -1);
3213             }
3214             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3215                 tPos = fText->moveIndex32(tPos, -1);
3216             }
3217             if (fQU->contains(fText->char32At(tPos))) {
3218                 continue;
3219             }
3220         }
3221
3222
3223
3224         // LB 16   (CL | CP) SP* x NS
3225         //    Scan backwards for SP* CM* (CL | CP)
3226         if (fNS->contains(thisChar)) {
3227             int tPos = prevPos;
3228             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3229                 tPos = fText->moveIndex32(tPos, -1);
3230             }
3231             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3232                 tPos = fText->moveIndex32(tPos, -1);
3233             }
3234             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3235                 continue;
3236             }
3237         }
3238
3239
3240         // LB 17        B2 SP* x B2
3241         if (fB2->contains(thisChar)) {
3242             //  Scan backwards, checking for the B2 CM* SP* sequence.
3243             tPos = prevPos;
3244             if (fSP->contains(prevChar)) {
3245                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3246                     tPos=fText->moveIndex32(tPos, -1);
3247                 }
3248             }
3249             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3250                 tPos=fText->moveIndex32(tPos, -1);
3251             }
3252             if (fB2->contains(fText->char32At(tPos))) {
3253                 continue;
3254             }
3255         }
3256
3257
3258         // LB 18    break after space
3259         if (fSP->contains(prevChar)) {
3260             break;
3261         }
3262
3263         // LB 19
3264         //    x   QU
3265         //    QU  x
3266         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3267             continue;
3268         }
3269
3270         // LB 20  Break around a CB
3271         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3272             break;
3273         }
3274
3275         // LB 21
3276         if (fBA->contains(thisChar) ||
3277             fHY->contains(thisChar) ||
3278             fNS->contains(thisChar) ||
3279             fBB->contains(prevChar) )   {
3280             continue;
3281         }
3282
3283         // LB 21a
3284         //   HL (HY | BA) x
3285         if (fHL->contains(prevCharX2) &&
3286                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3287             continue;
3288         }
3289
3290         // LB 22
3291         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3292             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3293             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3294             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3295             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3296             continue;
3297         }
3298
3299
3300         // LB 23    ID x PO
3301         //          AL x NU
3302         //          HL x NU
3303         //          NU x AL
3304         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3305             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3306             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3307             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3308             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3309             continue;
3310         }
3311
3312         // LB 24  Do not break between prefix and letters or ideographs.
3313         //        PR x ID
3314         //        PR x (AL | HL)
3315         //        PO x (AL | HL)
3316         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3317             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3318             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3319             continue;
3320         }
3321
3322
3323
3324         // LB 25    Numbers
3325         if (fNumberMatcher->lookingAt(prevPos, status)) {
3326             if (U_FAILURE(status)) {
3327                 break;
3328             }
3329             // Matched a number.  But could have been just a single digit, which would
3330             //    not represent a "no break here" between prevChar and thisChar
3331             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3332             if (numEndIdx > pos) {
3333                 // Number match includes at least our two chars being checked
3334                 if (numEndIdx > nextPos) {
3335                     // Number match includes additional chars.  Update pos and nextPos
3336                     //   so that next loop iteration will continue at the end of the number,
3337                     //   checking for breaks between last char in number & whatever follows.
3338                     pos = nextPos = numEndIdx;
3339                     do {
3340                         pos = fText->moveIndex32(pos, -1);
3341                         thisChar = fText->char32At(pos);
3342                     } while (fCM->contains(thisChar));
3343                 }
3344                 continue;
3345             }
3346         }
3347
3348
3349         // LB 26 Do not break a Korean syllable.
3350         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3351                                         fJV->contains(thisChar) ||
3352                                         fH2->contains(thisChar) ||
3353                                         fH3->contains(thisChar))) {
3354                                             continue;
3355                                         }
3356
3357         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3358             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3359                 continue;
3360         }
3361
3362         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3363             fJT->contains(thisChar)) {
3364                 continue;
3365         }
3366
3367         // LB 27 Treat a Korean Syllable Block the same as ID.
3368         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3369             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3370             fIN->contains(thisChar)) {
3371                 continue;
3372             }
3373         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3374             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3375             fPO->contains(thisChar)) {
3376                 continue;
3377             }
3378         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3379             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3380                 continue;
3381             }
3382
3383
3384
3385         // LB 28  Do not break between alphabetics ("at").
3386         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3387             continue;
3388         }
3389
3390         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3391         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3392             continue;
3393         }
3394
3395         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3396         //          (AL | NU) x OP
3397         //          CP x (AL | NU)
3398         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3399             continue;
3400         }
3401         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3402             continue;
3403         }
3404
3405         // LB 31    Break everywhere else
3406         break;
3407
3408     }
3409
3410     return pos;
3411 }
3412
3413
3414 UVector  *RBBILineMonkey::charClasses() {
3415     return fSets;
3416 }
3417
3418
3419 RBBILineMonkey::~RBBILineMonkey() {
3420     delete fSets;
3421
3422     delete fBK;
3423     delete fCR;
3424     delete fLF;
3425     delete fCM;
3426     delete fNL;
3427     delete fWJ;
3428     delete fZW;
3429     delete fGL;
3430     delete fCB;
3431     delete fSP;
3432     delete fB2;
3433     delete fBA;
3434     delete fBB;
3435     delete fHY;
3436     delete fH2;
3437     delete fH3;
3438     delete fCL;
3439     delete fCP;
3440     delete fEX;
3441     delete fIN;
3442     delete fJL;
3443     delete fJV;
3444     delete fJT;
3445     delete fNS;
3446     delete fOP;
3447     delete fQU;
3448     delete fIS;
3449     delete fNU;
3450     delete fPO;
3451     delete fPR;
3452     delete fSY;
3453     delete fAI;
3454     delete fAL;
3455     delete fCJ;
3456     delete fHL;
3457     delete fID;
3458     delete fSA;
3459     delete fSG;
3460     delete fXX;
3461
3462     delete fCharBI;
3463     delete fNumberMatcher;
3464 }
3465
3466
3467 //-------------------------------------------------------------------------------------------
3468 //
3469 //   TestMonkey
3470 //
3471 //     params
3472 //       seed=nnnnn        Random number starting seed.
3473 //                         Setting the seed allows errors to be reproduced.
3474 //       loop=nnn          Looping count.  Controls running time.
3475 //                         -1:  run forever.
3476 //                          0 or greater:  run length.
3477 //
3478 //       type = char | word | line | sent | title
3479 //
3480 //-------------------------------------------------------------------------------------------
3481
3482 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3483     int32_t val = defaultVal;
3484     name.append(" *= *(-?\\d+)");
3485     UErrorCode status = U_ZERO_ERROR;
3486     RegexMatcher m(name, params, 0, status);
3487     if (m.find()) {
3488         // The param exists.  Convert the string to an int.
3489         char valString[100];
3490         int32_t paramLength = m.end(1, status) - m.start(1, status);
3491         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3492             paramLength = (int32_t)(sizeof(valString)-2);
3493         }
3494         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3495         val = strtol(valString,  NULL, 10);
3496
3497         // Delete this parameter from the params string.
3498         m.reset();
3499         params = m.replaceFirst("", status);
3500     }
3501     U_ASSERT(U_SUCCESS(status));
3502     return val;
3503 }
3504 #endif
3505
3506 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3507                                     BreakIterator *bi,
3508                                     int expected[],
3509                                     int expectedcount)
3510 {
3511     int count = 0;
3512     int i = 0;
3513     int forward[50];
3514     bi->setText(ustr);
3515     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3516         forward[count] = i;
3517         if (count < expectedcount && expected[count] != i) {
3518             test->errln("break forward test failed: expected %d but got %d",
3519                         expected[count], i);
3520             break;
3521         }
3522         count ++;
3523     }
3524     if (count != expectedcount) {
3525         printStringBreaks(ustr, expected, expectedcount);
3526         test->errln("break forward test failed: missed %d match",
3527                     expectedcount - count);
3528         return;
3529     }
3530     // testing boundaries
3531     for (i = 1; i < expectedcount; i ++) {
3532         int j = expected[i - 1];
3533         if (!bi->isBoundary(j)) {
3534             printStringBreaks(ustr, expected, expectedcount);
3535             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3536             return;
3537         }
3538         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3539             if (bi->isBoundary(j)) {
3540                 printStringBreaks(ustr, expected, expectedcount);
3541                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3542                 return;
3543             }
3544         }
3545     }
3546
3547     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3548         count --;
3549         if (forward[count] != i) {
3550             test->errln("happy break test previous() failed: expected %d but got %d",
3551                         forward[count], i);
3552             break;
3553         }
3554     }
3555     if (count != 0) {
3556         printStringBreaks(ustr, expected, expectedcount);
3557         test->errln("break test previous() failed: missed a match");
3558         return;
3559     }
3560
3561     // testing preceding
3562     for (i = 0; i < expectedcount - 1; i ++) {
3563         // int j = expected[i] + 1;
3564         int j = ustr.moveIndex32(expected[i], 1);
3565         for (; j <= expected[i + 1]; j ++) {
3566             if (bi->preceding(j) != expected[i]) {
3567                 printStringBreaks(ustr, expected, expectedcount);
3568                 test->errln("preceding(): Not expecting boundary at position %d", j);
3569                 return;
3570             }
3571         }
3572     }
3573 }
3574
3575 void RBBITest::TestWordBreaks(void)
3576 {
3577 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3578
3579     Locale        locale("en");
3580     UErrorCode    status = U_ZERO_ERROR;
3581     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3582     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3583     static const char *strlist[] =
3584     {
3585     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3586     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
3587     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3588     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3589     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
3590     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3591     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3592     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
3593     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3594     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3595     "\\u2027\\U000e0067\\u0a47\\u00b7",
3596     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3597     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3598     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3599     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3600     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3601     "\\u0027\\u11af\\U000e0057\\u0602",
3602     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3603     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3604     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3605     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3606     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3607     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3608     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3609     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3610     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3611     "\\u58f4\\U000e0049\\u20e7\\u2027",
3612     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3613     "\\ua183\\u102d\\u0bec\\u003a",
3614     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3615     "\\u003a\\u0e57\\u0fad\\u002e",
3616     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3617     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3618     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3619     "\\u003a\\u0664\\u00b7\\u1fba",
3620     "\\u003b\\u0027\\u00b7\\u47a3",
3621     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
3622     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3623     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3624     };
3625     int loop;
3626     if (U_FAILURE(status)) {
3627         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3628         return;
3629     }
3630     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3631         // printf("looping %d\n", loop);
3632         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3633         // RBBICharMonkey monkey;
3634         RBBIWordMonkey monkey;
3635
3636         int expected[50];
3637         int expectedcount = 0;
3638
3639         monkey.setText(ustr);
3640         int i;
3641         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3642             expected[expectedcount ++] = i;
3643         }
3644
3645         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3646     }
3647     delete bi;
3648 #endif
3649 }
3650
3651 void RBBITest::TestWordBoundary(void)
3652 {
3653     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3654     Locale        locale("en");
3655     UErrorCode    status = U_ZERO_ERROR;
3656     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3657     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3658     UChar         str[50];
3659     static const char *strlist[] =
3660     {
3661     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3662     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3663     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3664     "\\u2027\\U000e0067\\u0a47\\u00b7",
3665     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3666     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3667     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3668     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3669     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3670     "\\u0027\\u11af\\U000e0057\\u0602",
3671     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3672     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3673     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3674     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3675     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3676     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3677     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3678     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3679     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3680     "\\u58f4\\U000e0049\\u20e7\\u2027",
3681     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3682     "\\ua183\\u102d\\u0bec\\u003a",
3683     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3684     "\\u003a\\u0e57\\u0fad\\u002e",
3685     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3686     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3687     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3688     "\\u003a\\u0664\\u00b7\\u1fba",
3689     "\\u003b\\u0027\\u00b7\\u47a3",
3690     };
3691     int loop;
3692     if (U_FAILURE(status)) {
3693         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3694         return;
3695     }
3696     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3697         // printf("looping %d\n", loop);
3698         u_unescape(strlist[loop], str, 20);
3699         UnicodeString ustr(str);
3700         int forward[50];
3701         int count = 0;
3702
3703         bi->setText(ustr);
3704         int prev = 0;
3705         int i;
3706         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3707             forward[count ++] = i;
3708             if (i > prev) {
3709                 int j;
3710                 for (j = prev + 1; j < i; j ++) {
3711                     if (bi->isBoundary(j)) {
3712                         printStringBreaks(ustr, forward, count);
3713                         errln("happy boundary test failed: expected %d not a boundary",
3714                                j);
3715                         return;
3716                     }
3717                 }
3718             }
3719             if (!bi->isBoundary(i)) {
3720                 printStringBreaks(ustr, forward, count);
3721                 errln("happy boundary test failed: expected %d a boundary",
3722                        i);
3723                 return;
3724             }
3725             prev = i;
3726         }
3727     }
3728     delete bi;
3729 }
3730
3731 void RBBITest::TestLineBreaks(void)
3732 {
3733 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3734     Locale        locale("en");
3735     UErrorCode    status = U_ZERO_ERROR;
3736     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3737     const int32_t  STRSIZE = 50;
3738     UChar         str[STRSIZE];
3739     static const char *strlist[] =
3740     {
3741      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3742      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3743              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3744      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3745              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3746      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3747      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3748      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3749      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3750      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3751      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3752      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3753      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3754      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3755      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3756      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3757      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3758      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3759      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3760      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3761      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3762      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3763      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3764      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3765      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3766      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3767      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3768      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3769      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3770      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3771      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3772      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3773      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3774      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3775      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3776      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3777      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3778      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3779      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3780      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3781      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3782      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3783          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3784          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3785          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3786      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3787          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3788     };
3789     int loop;
3790     TEST_ASSERT_SUCCESS(status);
3791     if (U_FAILURE(status)) {
3792         return;
3793     }
3794     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3795         // printf("looping %d\n", loop);
3796         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3797         if (t >= STRSIZE) {
3798             TEST_ASSERT(FALSE);
3799             continue;
3800         }
3801
3802
3803         UnicodeString ustr(str);
3804         RBBILineMonkey monkey;
3805         if (U_FAILURE(monkey.deferredStatus)) {
3806             continue;
3807         }
3808
3809         const int EXPECTEDSIZE = 50;
3810         int expected[EXPECTEDSIZE];
3811         int expectedcount = 0;
3812
3813         monkey.setText(ustr);
3814         int i;
3815         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3816             if (expectedcount >= EXPECTEDSIZE) {
3817                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3818                 return;
3819             }
3820             expected[expectedcount ++] = i;
3821         }
3822
3823         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3824     }
3825     delete bi;
3826 #endif
3827 }
3828
3829 void RBBITest::TestSentBreaks(void)
3830 {
3831 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3832     Locale        locale("en");
3833     UErrorCode    status = U_ZERO_ERROR;
3834     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3835     UChar         str[200];
3836     static const char *strlist[] =
3837     {
3838      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3839      "This\n",
3840      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3841      "\"Sentence ending with a quote.\" Bye.",
3842      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3843      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3844      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3845      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3846      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3847      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3848      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3849              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3850              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3851              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3852      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3853              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3854              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3855              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3856              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3857              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3858     };
3859     int loop;
3860     if (U_FAILURE(status)) {
3861         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3862         return;
3863     }
3864     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3865         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3866         UnicodeString ustr(str);
3867
3868         RBBISentMonkey monkey;
3869         if (U_FAILURE(monkey.deferredStatus)) {
3870             continue;
3871         }
3872
3873         const int EXPECTEDSIZE = 50;
3874         int expected[EXPECTEDSIZE];
3875         int expectedcount = 0;
3876
3877         monkey.setText(ustr);
3878         int i;
3879         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3880             if (expectedcount >= EXPECTEDSIZE) {
3881                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3882                 return;
3883             }
3884             expected[expectedcount ++] = i;
3885         }
3886
3887         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3888     }
3889     delete bi;
3890 #endif
3891 }
3892
3893 void RBBITest::TestMonkey(char *params) {
3894 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3895
3896     UErrorCode     status    = U_ZERO_ERROR;
3897     int32_t        loopCount = 500;
3898     int32_t        seed      = 1;
3899     UnicodeString  breakType = "all";
3900     Locale         locale("en");
3901     UBool          useUText  = FALSE;
3902
3903     if (quick == FALSE) {
3904         loopCount = 10000;
3905     }
3906
3907     if (params) {
3908         UnicodeString p(params);
3909         loopCount = getIntParam("loop", p, loopCount);
3910         seed      = getIntParam("seed", p, seed);
3911
3912         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
3913         if (m.find()) {
3914             breakType = m.group(1, status);
3915             m.reset();
3916             p = m.replaceFirst("", status);
3917         }
3918
3919         RegexMatcher u(" *utext", p, 0, status);
3920         if (u.find()) {
3921             useUText = TRUE;
3922             u.reset();
3923             p = u.replaceFirst("", status);
3924         }
3925
3926
3927         // m.reset(p);
3928         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
3929             // Each option is stripped out of the option string as it is processed.
3930             // All options have been checked.  The option string should have been completely emptied..
3931             char buf[100];
3932             p.extract(buf, sizeof(buf), NULL, status);
3933             buf[sizeof(buf)-1] = 0;
3934             errln("Unrecognized or extra parameter:  %s\n", buf);
3935             return;
3936         }
3937
3938     }
3939
3940     if (breakType == "char" || breakType == "all") {
3941         RBBICharMonkey  m;
3942         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3943         if (U_SUCCESS(status)) {
3944             RunMonkey(bi, m, "char", seed, loopCount, useUText);
3945             if (breakType == "all" && useUText==FALSE) {
3946                 // Also run a quick test with UText when "all" is specified
3947                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
3948             }
3949         }
3950         else {
3951             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
3952         }
3953         delete bi;
3954     }
3955
3956     if (breakType == "word" || breakType == "all") {
3957         logln("Word Break Monkey Test");
3958         RBBIWordMonkey  m;
3959         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
3960         if (U_SUCCESS(status)) {
3961             RunMonkey(bi, m, "word", seed, loopCount, useUText);
3962         }
3963         else {
3964             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
3965         }
3966         delete bi;
3967     }
3968
3969     if (breakType == "line" || breakType == "all") {
3970         logln("Line Break Monkey Test");
3971         RBBILineMonkey  m;
3972         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
3973         if (loopCount >= 10) {
3974             loopCount = loopCount / 5;   // Line break runs slower than the others.
3975         }
3976         if (U_SUCCESS(status)) {
3977             RunMonkey(bi, m, "line", seed, loopCount, useUText);
3978         }
3979         else {
3980             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3981         }
3982         delete bi;
3983     }
3984
3985     if (breakType == "sent" || breakType == "all"  ) {
3986         logln("Sentence Break Monkey Test");
3987         RBBISentMonkey  m;
3988         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
3989         if (loopCount >= 10) {
3990             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
3991         }
3992         if (U_SUCCESS(status)) {
3993             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
3994         }
3995         else {
3996             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
3997         }
3998         delete bi;
3999     }
4000
4001 #endif
4002 }
4003
//
//  Run a RBBI monkey test.  Common routine, for all break iterator types.
//    Parameters:
//       bi      - the break iterator to use
//       mk      - MonkeyKind, abstraction for obtaining expected results
//       name    - Name of test (char, word, etc.) for use in error messages
//       seed    - Seed for starting random number generator (parameter from user)
//       numIterations - Number of random test strings to generate and check.
//                       A value of -1 makes the test loop without end.
//       useUText - If TRUE, the text is set on the break iterator through a UText
//                  instead of directly from the UnicodeString.
//
4013 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4014                          int32_t numIterations, UBool useUText) {
4015
4016 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4017
4018     const int32_t    TESTSTRINGLEN = 500;
4019     UnicodeString    testText;
4020     int32_t          numCharClasses;
4021     UVector          *chClasses;
4022     int              expected[TESTSTRINGLEN*2 + 1];
4023     int              expectedCount = 0;
4024     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4025     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4026     char             reverseBreaks[TESTSTRINGLEN*2+1];
4027     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4028     char             followingBreaks[TESTSTRINGLEN*2+1];
4029     char             precedingBreaks[TESTSTRINGLEN*2+1];
4030     int              i;
4031     int              loopCount = 0;
4032
4033     m_seed = seed;
4034
4035     numCharClasses = mk.charClasses()->size();
4036     chClasses      = mk.charClasses();
4037
4038     // Check for errors that occured during the construction of the MonkeyKind object.
4039     //  Can't report them where they occured because errln() is a method coming from intlTest,
4040     //  and is not visible outside of RBBITest :-(
4041     if (U_FAILURE(mk.deferredStatus)) {
4042         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4043         return;
4044     }
4045
4046     // Verify that the character classes all have at least one member.
4047     for (i=0; i<numCharClasses; i++) {
4048         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4049         if (s == NULL || s->size() == 0) {
4050             errln("Character Class #%d is null or of zero size.", i);
4051             return;
4052         }
4053     }
4054
4055     while (loopCount < numIterations || numIterations == -1) {
4056         if (numIterations == -1 && loopCount % 10 == 0) {
4057             // If test is running in an infinite loop, display a periodic tic so
4058             //   we can tell that it is making progress.
4059             fprintf(stderr, ".");
4060         }
4061         // Save current random number seed, so that we can recreate the random numbers
4062         //   for this loop iteration in event of an error.
4063         seed = m_seed;
4064
4065         // Populate a test string with data.
4066         testText.truncate(0);
4067         for (i=0; i<TESTSTRINGLEN; i++) {
4068             int32_t  aClassNum = m_rand() % numCharClasses;
4069             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4070             int32_t   charIdx = m_rand() % classSet->size();
4071             UChar32   c = classSet->charAt(charIdx);
4072             if (c < 0) {   // TODO:  deal with sets containing strings.
4073                 errln("c < 0");
4074                 break;
4075             }
4076             testText.append(c);
4077         }
4078
4079         // Calculate the expected results for this test string.
4080         mk.setText(testText);
4081         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4082         expectedBreaks[0] = 1;
4083         int32_t breakPos = 0;
4084         expectedCount = 0;
4085         for (;;) {
4086             breakPos = mk.next(breakPos);
4087             if (breakPos == -1) {
4088                 break;
4089             }
4090             if (breakPos > testText.length()) {
4091                 errln("breakPos > testText.length()");
4092             }
4093             expectedBreaks[breakPos] = 1;
4094             U_ASSERT(expectedCount<testText.length());
4095             expected[expectedCount ++] = breakPos;
4096         }
4097
4098         // Find the break positions using forward iteration
4099         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4100         if (useUText) {
4101             UErrorCode status = U_ZERO_ERROR;
4102             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4103             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4104             bi->setText(testUText, status);
4105             TEST_ASSERT_SUCCESS(status);
4106             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4107                                       //  This UText can be closed immediately, so long as the
4108                                       //  testText string continues to exist.
4109         } else {
4110             bi->setText(testText);
4111         }
4112
4113         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4114             if (i < 0 || i > testText.length()) {
4115                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4116                 break;
4117             }
4118             forwardBreaks[i] = 1;
4119         }
4120
4121         // Find the break positions using reverse iteration
4122         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4123         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4124             if (i < 0 || i > testText.length()) {
4125                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4126                 break;
4127             }
4128             reverseBreaks[i] = 1;
4129         }
4130
4131         // Find the break positions using isBoundary() tests.
4132         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4133         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4134         for (i=0; i<=testText.length(); i++) {
4135             isBoundaryBreaks[i] = bi->isBoundary(i);
4136         }
4137
4138
4139         // Find the break positions using the following() function.
4140         // printf(".");
4141         memset(followingBreaks, 0, sizeof(followingBreaks));
4142         int32_t   lastBreakPos = 0;
4143         followingBreaks[0] = 1;
4144         for (i=0; i<testText.length(); i++) {
4145             breakPos = bi->following(i);
4146             if (breakPos <= i ||
4147                 breakPos < lastBreakPos ||
4148                 breakPos > testText.length() ||
4149                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4150                 errln("%s break monkey test: "
4151                     "Out of range value returned by BreakIterator::following().\n"
4152                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4153                          name, seed, i, breakPos, lastBreakPos);
4154                 break;
4155             }
4156             followingBreaks[breakPos] = 1;
4157             lastBreakPos = breakPos;
4158         }
4159
4160         // Find the break positions using the preceding() function.
4161         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4162         lastBreakPos = testText.length();
4163         precedingBreaks[testText.length()] = 1;
4164         for (i=testText.length(); i>0; i--) {
4165             breakPos = bi->preceding(i);
4166             if (breakPos >= i ||
4167                 breakPos > lastBreakPos ||
4168                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4169                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4170                 errln("%s break monkey test: "
4171                     "Out of range value returned by BreakIterator::preceding().\n"
4172                     "index=%d;  prev returned %d; lastBreak=%d" ,
4173                     name,  i, breakPos, lastBreakPos);
4174                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4175                     precedingBreaks[i] = 2;   // Forces an error.
4176                 }
4177             } else {
4178                 if (breakPos >= 0) {
4179                     precedingBreaks[breakPos] = 1;
4180                 }
4181                 lastBreakPos = breakPos;
4182             }
4183         }
4184
4185         // Compare the expected and actual results.
4186         for (i=0; i<=testText.length(); i++) {
4187             const char *errorType = NULL;
4188             if  (forwardBreaks[i] != expectedBreaks[i]) {
4189                 errorType = "next()";
4190             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4191                 errorType = "previous()";
4192             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4193                 errorType = "isBoundary()";
4194             } else if (followingBreaks[i] != expectedBreaks[i]) {
4195                 errorType = "following()";
4196             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4197                 errorType = "preceding()";
4198             }
4199
4200
4201             if (errorType != NULL) {
4202                 // Format a range of the test text that includes the failure as
4203                 //  a data item that can be included in the rbbi test data file.
4204
4205                 // Start of the range is the last point where expected and actual results
4206                 //   both agreed that there was a break position.
4207                 int startContext = i;
4208                 int32_t count = 0;
4209                 for (;;) {
4210                     if (startContext==0) { break; }
4211                     startContext --;
4212                     if (expectedBreaks[startContext] != 0) {
4213                         if (count == 2) break;
4214                         count ++;
4215                     }
4216                 }
4217
4218                 // End of range is two expected breaks past the start position.
4219                 int endContext = i + 1;
4220                 int ci;
4221                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4222                     for (;;) {
4223                         if (endContext >= testText.length()) {break;}
4224                         if (expectedBreaks[endContext-1] != 0) {
4225                             if (count == 0) break;
4226                             count --;
4227                         }
4228                         endContext ++;
4229                     }
4230                 }
4231
4232                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4233                 UnicodeString errorText = "<data>";
4234                 /***if (strcmp(errorType, "next()") == 0) {
4235                     startContext = 0;
4236                     endContext = testText.length();
4237
4238                     printStringBreaks(testText, expected, expectedCount);
4239                 }***/
4240
4241                 for (ci=startContext; ci<endContext;) {
4242                     UnicodeString hexChars("0123456789abcdef");
4243                     UChar32  c;
4244                     int      bn;
4245                     c = testText.char32At(ci);
4246                     if (ci == i) {
4247                         // This is the location of the error.
4248                         errorText.append("<?>");
4249                     } else if (expectedBreaks[ci] != 0) {
4250                         // This a non-error expected break position.
4251                         errorText.append("\\");
4252                     }
4253                     if (c < 0x10000) {
4254                         errorText.append("\\u");
4255                         for (bn=12; bn>=0; bn-=4) {
4256                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4257                         }
4258                     } else {
4259                         errorText.append("\\U");
4260                         for (bn=28; bn>=0; bn-=4) {
4261                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4262                         }
4263                     }
4264                     ci = testText.moveIndex32(ci, 1);
4265                 }
4266                 errorText.append("\\");
4267                 errorText.append("</data>\n");
4268
4269                 // Output the error
4270                 char  charErrorTxt[500];
4271                 UErrorCode status = U_ZERO_ERROR;
4272                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4273                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4274                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4275
4276                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4277                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4278                     errorType, seed, i, charErrorTxt);
4279                 break;
4280             }
4281         }
4282
4283         loopCount++;
4284     }
4285 #endif
4286 }
4287
4288
4289 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4290 //             This test checks the initial patch,
4291 //             which is to just keep it from crashing.  Correct word boundaries
4292 //             await a proper fix to the dictionary code.
4293 //
4294 void RBBITest::TestBug5532(void)  {
4295    // Text includes a mixture of Thai and Latin.
4296    const unsigned char utf8Data[] = {
4297            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4298            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4299            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4300            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4301            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4302            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4303            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4304            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4305            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4306            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4307            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4308
4309     UErrorCode status = U_ZERO_ERROR;
4310     UText utext=UTEXT_INITIALIZER;
4311     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4312     TEST_ASSERT_SUCCESS(status);
4313
4314     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4315     TEST_ASSERT_SUCCESS(status);
4316     if (U_SUCCESS(status)) {
4317         bi->setText(&utext, status);
4318         TEST_ASSERT_SUCCESS(status);
4319
4320         int32_t breakCount = 0;
4321         int32_t previousBreak = -1;
4322         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4323             // For now, just make sure that the break iterator doesn't hang.
4324             TEST_ASSERT(previousBreak < bi->current());
4325             previousBreak = bi->current();
4326         }
4327         TEST_ASSERT(breakCount > 0);
4328     }
4329     delete bi;
4330     utext_close(&utext);
4331 }
4332
4333
4334 //
4335 //  TestDebug    -  A place-holder test for debugging purposes.
4336 //                  For putting in fragments of other tests that can be invoked
4337 //                  for tracing  without a lot of unwanted extra stuff happening.
4338 //
4339 void RBBITest::TestDebug(void) {
4340 #if 0
4341     UErrorCode   status = U_ZERO_ERROR;
4342     int pos = 0;
4343     int ruleStatus = 0;
4344
4345     RuleBasedBreakIterator* bi =
4346        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4347        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4348        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4349     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4350     // UnicodeString s("Aaa.  Bcd");
4351     s = s.unescape();
4352     bi->setText(s);
4353     UBool r = bi->isBoundary(8);
4354     printf("%s", r?"true":"false");
4355     return;
4356     pos = bi->last();
4357     do {
4358         // ruleStatus = bi->getRuleStatus();
4359         printf("%d\t%d\n", pos, ruleStatus);
4360         pos = bi->previous();
4361     } while (pos != BreakIterator::DONE);
4362 #endif
4363 }
4364
4365 void RBBITest::TestProperties() {
4366     UErrorCode errorCode = U_ZERO_ERROR;
4367     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4368     if (!prependSet.isEmpty()) {
4369         errln(
4370             "[:GCB=Prepend:] is not empty any more. "
4371             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4372             "change this test to the opposite condition.");
4373     }
4374 }
4375
4376 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */