icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2015, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include "utypeinfo.h"  // for 'typeid' to work
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_BREAK_ITERATION
  17
  18 #include "unicode/utypes.h"
  19 #include "unicode/brkiter.h"
  20 #include "unicode/rbbi.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/utf16.h"
  23 #include "unicode/ucnv.h"
  24 #include "unicode/schriter.h"
  25 #include "unicode/uniset.h"
  26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  27 #include "unicode/regex.h"
  28 #endif
  29 #include "unicode/ustring.h"
  30 #include "unicode/utext.h"
  31 #include "intltest.h"
  32 #include "rbbitst.h"
  33 #include <string.h>
  34 #include "charstr.h"
  35 #include "uvector.h"
  36 #include "uvectr32.h"
  37 #include <stdio.h>
  38 #include <stdlib.h>
  39 #include "unicode/numfmt.h"
  40 #include "unicode/uscript.h"
  41 #include "cmemory.h"
  42
  43 #define TEST_ASSERT(x) {if (!(x)) { \
  44     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  45
  46 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  47     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  48
  49
  50 //---------------------------------------------
  51 // runIndexedTest
  52 //---------------------------------------------
  53
  54
  55 //  Note:  Before adding new tests to this file, check whether the desired test data can
  56 //         simply be added to the file testdata/rbbitest.txt.  In most cases it can,
  57 //         it's much less work than writing a new test, diagnostic output in the event of failures
  58 //         is good, and the test data file will is shared with ICU4J, so eventually the test
  59 //         will run there as well, without additional effort.
  60
  61 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  62 {
  63     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  64
  65     switch (index) {
  66 #if !UCONFIG_NO_FILE_IO
  67         case 0: name = "TestBug4153072";
  68             if(exec) TestBug4153072();                         break;
  69 #else
  70         case 0: name = "skip";
  71             break;
  72 #endif
  73
  74         case 1: name = "skip";
  75             break;
  76         case 2: name = "TestStatusReturn";
  77             if(exec) TestStatusReturn();                       break;
  78
  79 #if !UCONFIG_NO_FILE_IO
  80         case 3: name = "TestUnicodeFiles";
  81             if(exec) TestUnicodeFiles();                       break;
  82         case 4: name = "TestEmptyString";
  83             if(exec) TestEmptyString();                        break;
  84 #else
  85         case 3: case 4: name = "skip";
  86             break;
  87 #endif
  88
  89         case 5: name = "TestGetAvailableLocales";
  90             if(exec) TestGetAvailableLocales();                break;
  91
  92         case 6: name = "TestGetDisplayName";
  93             if(exec) TestGetDisplayName();                     break;
  94
  95 #if !UCONFIG_NO_FILE_IO
  96         case 7: name = "TestEndBehaviour";
  97             if(exec) TestEndBehaviour();                       break;
  98         case 8: case 9: case 10: name = "skip";
  99              break;
 100         case 11: name = "TestWordBreaks";
 101              if(exec) TestWordBreaks();                        break;
 102         case 12: name = "TestWordBoundary";
 103              if(exec) TestWordBoundary();                      break;
 104         case 13: name = "TestLineBreaks";
 105              if(exec) TestLineBreaks();                        break;
 106         case 14: name = "TestSentBreaks";
 107              if(exec) TestSentBreaks();                        break;
 108         case 15: name = "TestExtended";
 109              if(exec) TestExtended();                          break;
 110 #else
 111         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 112              break;
 113 #endif
 114
 115 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 116         case 16:
 117             name = "TestMonkey"; if(exec)  TestMonkey(params); break;
 118 #else
 119         case 16:
 120              name = "skip";                                    break;
 121 #endif
 122
 123 #if !UCONFIG_NO_FILE_IO
 124         case 17: name = "TestBug3818";
 125             if(exec) TestBug3818();                            break;
 126 #else
 127         case 17: name = "skip";
 128             break;
 129 #endif
 130
 131         case 18: name = "skip";
 132             break;
 133         case 19: name = "TestDebug";
 134             if(exec) TestDebug();                              break;
 135         case 20: name = "skip";
 136             break;
 137
 138 #if !UCONFIG_NO_FILE_IO
 139         case 21: name = "TestBug5775";
 140             if (exec) TestBug5775();                           break;
 141 #else
 142         case 21: name = "skip";
 143             break;
 144 #endif
 145
 146         case 22: name = "TestBug9983";
 147             if (exec) TestBug9983();                           break;
 148         case 23: name = "TestDictRules";
 149             if (exec) TestDictRules();                         break;
 150         case 24: name = "TestBug5532";
 151             if (exec) TestBug5532();                           break;
 152         default: name = ""; break; //needed to end loop
 153     }
 154 }
 155
 156
 157 //---------------------------------------------------------------------------
 158 //
 159 //   class BITestData   Holds a set of Break iterator test data and results
 160 //                      Includes
 161 //                         - the string data to be broken
 162 //                         - a vector of the expected break positions.
 163 //                         - a vector of source line numbers for the data,
 164 //                               (to help see where errors occured.)
 165 //                         - The expected break tag values.
 166 //                         - Vectors of actual break positions and tag values.
 167 //                         - Functions for comparing actual with expected and
 168 //                            reporting errors.
 169 //
 170 //----------------------------------------------------------------------------
 171 class BITestData {
 172 public:
 173     UnicodeString    fDataToBreak;
 174     UVector          fExpectedBreakPositions;
 175     UVector          fExpectedTags;
 176     UVector          fLineNum;
 177     UVector          fActualBreakPositions;   // Test Results.
 178     UVector          fActualTags;
 179
 180     BITestData(UErrorCode &status);
 181     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 182     void             checkResults(const char *heading, RBBITest *test);
 183     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 184     void             clearResults();
 185 };
 186
 187 //
 188 // Constructor.
 189 //
 190 BITestData::BITestData(UErrorCode &status)
 191 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 192   fActualTags(status)
 193 {
 194 }
 195
 196 //
 197 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 198 //                 The macro form collects the line number, which is helpful
 199 //                 when tracking down failures.
 200 //
 201 //                 A null data item is inserted at the start of each test's data
 202 //                  to put the starting zero into the data list.  The position saved for
 203 //                  each non-null item is its ending position.
 204 //
 205 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 206 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 207     if (U_FAILURE(status)) {return;}
 208     if (data != NULL) {
 209         fDataToBreak.append(CharsToUnicodeString(data));
 210     }
 211     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 212     fExpectedTags.addElement(tag, status);
 213     fLineNum.addElement(lineNum, status);
 214 }
 215
 216
 217 //
 218 //  checkResults.   Compare the actual and expected break positions, report any differences.
 219 //
 220 void BITestData::checkResults(const char *heading, RBBITest *test) {
 221     int32_t   expectedIndex = 0;
 222     int32_t   actualIndex = 0;
 223
 224     for (;;) {
 225         // If we've run through both the expected and actual results vectors, we're done.
 226         //   break out of the loop.
 227         if (expectedIndex >= fExpectedBreakPositions.size() &&
 228             actualIndex   >= fActualBreakPositions.size()) {
 229             break;
 230         }
 231
 232
 233         if (expectedIndex >= fExpectedBreakPositions.size()) {
 234             err(heading, test, expectedIndex-1, actualIndex);
 235             actualIndex++;
 236             continue;
 237         }
 238
 239         if (actualIndex >= fActualBreakPositions.size()) {
 240             err(heading, test, expectedIndex, actualIndex-1);
 241             expectedIndex++;
 242             continue;
 243         }
 244
 245         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 246             err(heading, test, expectedIndex, actualIndex);
 247             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 248             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 249                 actualIndex++;
 250             } else {
 251                 expectedIndex++;
 252             }
 253             continue;
 254         }
 255
 256         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 257             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 258                 heading, fLineNum.elementAt(expectedIndex),
 259                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 260         }
 261
 262         actualIndex++;
 263         expectedIndex++;
 264     }
 265 }
 266
 267 //
 268 //  err   -  An error was found.  Report it, along with information about where the
 269 //                                incorrectly broken test data appeared in the source file.
 270 //
 271 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 272 {
 273     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 274     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 275     int32_t   o        = 0;
 276     int32_t   line     = fLineNum.elementAti(expectedIdx);
 277     if (expectedIdx > 0) {
 278         // The line numbers are off by one because a premature break occurs somewhere
 279         //    within the previous item, rather than at the start of the current (expected) item.
 280         //    We want to report the offset of the unexpected break from the start of
 281         //      this previous item.
 282         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 283     }
 284     if (actual < expected) {
 285         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 286     } else {
 287         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 288     }
 289 }
 290
 291
 292 void BITestData::clearResults() {
 293     fActualBreakPositions.removeAllElements();
 294     fActualTags.removeAllElements();
 295 }
 296
 297
 298 //--------------------------------------------------------------------------------------
 299 //
 300 //    RBBITest    constructor and destructor
 301 //
 302 //--------------------------------------------------------------------------------------
 303
 304 RBBITest::RBBITest() {
 305 }
 306
 307
 308 RBBITest::~RBBITest() {
 309 }
 310
 311 //-----------------------------------------------------------------------------------
 312 //
 313 //   Test for status {tag} return value from break rules.
 314 //        TODO:  a more thorough test.
 315 //
 316 //-----------------------------------------------------------------------------------
 317 void RBBITest::TestStatusReturn() {
 318      UnicodeString rulesString1("$Letters = [:L:];\n"
 319                                   "$Numbers = [:N:];\n"
 320                                   "$Letters+{1};\n"
 321                                   "$Numbers+{2};\n"
 322                                   "Help\\ {4}/me\\!;\n"
 323                                   "[^$Letters $Numbers];\n"
 324                                   "!.*;\n", -1, US_INV);
 325      UnicodeString testString1  = "abc123..abc Help me Help me!";
 326                                 // 01234567890123456789012345678
 327      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 328      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 329
 330      UErrorCode status=U_ZERO_ERROR;
 331      UParseError    parseError;
 332
 333      BreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 334      if(U_FAILURE(status)) {
 335          dataerrln("FAIL : in construction - %s", u_errorName(status));
 336      } else {
 337          int32_t  pos;
 338          int32_t  i = 0;
 339          bi->setText(testString1);
 340          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 341              if (pos != bounds1[i]) {
 342                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 343                  break;
 344              }
 345
 346              int tag = bi->getRuleStatus();
 347              if (tag != brkStatus[i]) {
 348                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 349                  break;
 350              }
 351              i++;
 352          }
 353      }
 354      delete bi;
 355 }
 356
 357
 358 static void printStringBreaks(UText *tstr, int expected[], int expectedCount) {
 359     UErrorCode status = U_ZERO_ERROR;
 360     char name[100];
 361     printf("code    alpha extend alphanum type word sent line name\n");
 362     int nextExpectedIndex = 0;
 363     utext_setNativeIndex(tstr, 0);
 364     for (int j = 0; j < utext_nativeLength(tstr); j=utext_getNativeIndex(tstr)) {
 365         if (nextExpectedIndex < expectedCount && j >= expected[nextExpectedIndex] ) {
 366             printf("------------------------------------------------ %d\n", j);
 367             ++nextExpectedIndex;
 368         }
 369
 370         UChar32 c = utext_next32(tstr);
 371         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 372         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 373                            u_isUAlphabetic(c),
 374                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 375                            u_isalnum(c),
 376                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 377                                                   u_charType(c),
 378                                                   U_SHORT_PROPERTY_NAME),
 379                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 380                                                   u_getIntPropertyValue(c,
 381                                                           UCHAR_WORD_BREAK),
 382                                                   U_SHORT_PROPERTY_NAME),
 383                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 384                                    u_getIntPropertyValue(c,
 385                                            UCHAR_SENTENCE_BREAK),
 386                                    U_SHORT_PROPERTY_NAME),
 387                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 388                                    u_getIntPropertyValue(c,
 389                                            UCHAR_LINE_BREAK),
 390                                    U_SHORT_PROPERTY_NAME),
 391                            name);
 392     }
 393 }
 394
 395
 396 static void printStringBreaks(const UnicodeString &ustr, int expected[], int expectedCount) {
 397    UErrorCode status = U_ZERO_ERROR;
 398    UText *tstr = NULL;
 399    tstr = utext_openConstUnicodeString(NULL, &ustr, &status);
 400    if (U_FAILURE(status)) {
 401        printf("printStringBreaks, utext_openConstUnicodeString() returns %s\n", u_errorName(status));
 402        return;
 403     }
 404    printStringBreaks(tstr, expected, expectedCount);
 405    utext_close(tstr);
 406 }
 407
 408
 409 void RBBITest::TestBug3818() {
 410     UErrorCode  status = U_ZERO_ERROR;
 411
 412     // Four Thai words...
 413     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 414                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 415     UnicodeString  thaiStr(thaiWordData);
 416
 417     BreakIterator* bi = BreakIterator::createWordInstance(Locale("th"), status);
 418     if (U_FAILURE(status) || bi == NULL) {
 419         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 420         return;
 421     }
 422     bi->setText(thaiStr);
 423
 424     int32_t  startOfSecondWord = bi->following(1);
 425     if (startOfSecondWord != 4) {
 426         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 427             __FILE__, __LINE__, startOfSecondWord);
 428     }
 429     startOfSecondWord = bi->following(0);
 430     if (startOfSecondWord != 4) {
 431         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 432             __FILE__, __LINE__, startOfSecondWord);
 433     }
 434     delete bi;
 435 }
 436
 437 //----------------------------------------------------------------------------
 438 //
 439 // generalIteratorTest      Given a break iterator and a set of test data,
 440 //                          Run the tests and report the results.
 441 //
 442 //----------------------------------------------------------------------------
 443 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 444 {
 445
 446     bi.setText(td.fDataToBreak);
 447
 448     testFirstAndNext(bi, td);
 449
 450     testLastAndPrevious(bi, td);
 451
 452     testFollowing(bi, td);
 453     testPreceding(bi, td);
 454     testIsBoundary(bi, td);
 455     doMultipleSelectionTest(bi, td);
 456 }
 457
 458
 459 //
 460 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 461 //                       kind of loop.
 462 //
 463 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 464 {
 465     UErrorCode  status = U_ZERO_ERROR;
 466     int32_t     p;
 467     int32_t     lastP = -1;
 468     int32_t     tag;
 469
 470     logln("Test first and next");
 471     bi.setText(td.fDataToBreak);
 472     td.clearResults();
 473
 474     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 475         td.fActualBreakPositions.addElement(p, status);  // Save result.
 476         tag = bi.getRuleStatus();
 477         td.fActualTags.addElement(tag, status);
 478         if (p <= lastP) {
 479             // If the iterator is not making forward progress, stop.
 480             //  No need to raise an error here, it'll be detected in the normal check of results.
 481             break;
 482         }
 483         lastP = p;
 484     }
 485     td.checkResults("testFirstAndNext", this);
 486 }
 487
 488
 489 //
 490 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 491 //
 492 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 493 {
 494     UErrorCode  status = U_ZERO_ERROR;
 495     int32_t     p;
 496     int32_t     lastP  = 0x7ffffffe;
 497     int32_t     tag;
 498
 499     logln("Test last and previous");
 500     bi.setText(td.fDataToBreak);
 501     td.clearResults();
 502
 503     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 504         // Save break position.  Insert it at start of vector of results, shoving
 505         //    already-saved results further towards the end.
 506         td.fActualBreakPositions.insertElementAt(p, 0, status);
 507         // bi.previous();   // TODO:  Why does this fix things up????
 508         // bi.next();
 509         tag = bi.getRuleStatus();
 510         td.fActualTags.insertElementAt(tag, 0, status);
 511         if (p >= lastP) {
 512             // If the iterator is not making progress, stop.
 513             //  No need to raise an error here, it'll be detected in the normal check of results.
 514             break;
 515         }
 516         lastP = p;
 517     }
 518     td.checkResults("testLastAndPrevious", this);
 519 }
 520
 521
 522 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 523 {
 524     UErrorCode  status = U_ZERO_ERROR;
 525     int32_t     p;
 526     int32_t     tag;
 527     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 528                                  //   cannot be -1; that is returned for DONE.
 529     int         i;
 530
 531     logln("testFollowing():");
 532     bi.setText(td.fDataToBreak);
 533     td.clearResults();
 534
 535     // Save the starting point, since we won't get that out of following.
 536     p = bi.first();
 537     td.fActualBreakPositions.addElement(p, status);  // Save result.
 538     tag = bi.getRuleStatus();
 539     td.fActualTags.addElement(tag, status);
 540
 541     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 542         p = bi.following(i);
 543         if (p != lastP) {
 544             if (p == RuleBasedBreakIterator::DONE) {
 545                 break;
 546             }
 547             // We've reached a new break position.  Save it.
 548             td.fActualBreakPositions.addElement(p, status);  // Save result.
 549             tag = bi.getRuleStatus();
 550             td.fActualTags.addElement(tag, status);
 551             lastP = p;
 552         }
 553     }
 554     // The loop normally exits by means of the break in the middle.
 555     // Make sure that the index was at the correct position for the break iterator to have
 556     //   returned DONE.
 557     if (i != td.fDataToBreak.length()) {
 558         errln("testFollowing():  iterator returned DONE prematurely.");
 559     }
 560
 561     // Full check of all results.
 562     td.checkResults("testFollowing", this);
 563 }
 564
 565
 566
 567 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 568     UErrorCode  status = U_ZERO_ERROR;
 569     int32_t     p;
 570     int32_t     tag;
 571     int32_t     lastP  = 0x7ffffffe;
 572     int         i;
 573
 574     logln("testPreceding():");
 575     bi.setText(td.fDataToBreak);
 576     td.clearResults();
 577
 578     p = bi.last();
 579     td.fActualBreakPositions.addElement(p, status);
 580     tag = bi.getRuleStatus();
 581     td.fActualTags.addElement(tag, status);
 582
 583     for (i = td.fDataToBreak.length(); i>=-1; i--) {
 584         p = bi.preceding(i);
 585         if (p != lastP) {
 586             if (p == RuleBasedBreakIterator::DONE) {
 587                 break;
 588             }
 589             // We've reached a new break position.  Save it.
 590             td.fActualBreakPositions.insertElementAt(p, 0, status);
 591             lastP = p;
 592             tag = bi.getRuleStatus();
 593             td.fActualTags.insertElementAt(tag, 0, status);
 594         }
 595     }
 596     // The loop normally exits by means of the break in the middle.
 597     // Make sure that the index was at the correct position for the break iterator to have
 598     //   returned DONE.
 599     if (i != 0) {
 600         errln("testPreceding():  iterator returned DONE prematurely.");
 601     }
 602
 603     // Full check of all results.
 604     td.checkResults("testPreceding", this);
 605 }
 606
 607
 608
 609 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
 610     UErrorCode  status = U_ZERO_ERROR;
 611     int         i;
 612     int32_t     tag;
 613
 614     logln("testIsBoundary():");
 615     bi.setText(td.fDataToBreak);
 616     td.clearResults();
 617
 618     for (i = 0; i <= td.fDataToBreak.length(); i++) {
 619         if (bi.isBoundary(i)) {
 620             td.fActualBreakPositions.addElement(i, status);  // Save result.
 621             tag = bi.getRuleStatus();
 622             td.fActualTags.addElement(tag, status);
 623         }
 624     }
 625     td.checkResults("testIsBoundary: ", this);
 626 }
 627
 628
 629
 630 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
 631 {
 632     iterator.setText(td.fDataToBreak);
 633
 634     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
 635     int32_t offset = iterator.first();
 636     int32_t testOffset;
 637     int32_t count = 0;
 638
 639     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
 640
 641     if (*testIterator != iterator)
 642         errln("clone() or operator!= failed: two clones compared unequal");
 643
 644     do {
 645         testOffset = testIterator->first();
 646         testOffset = testIterator->next(count);
 647         if (offset != testOffset)
 648             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 649
 650         if (offset != RuleBasedBreakIterator::DONE) {
 651             count++;
 652             offset = iterator.next();
 653
 654             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
 655                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
 656                 if (count > 10000 || offset == -1) {
 657                     errln("operator== failed too many times. Stopping test.");
 658                     if (offset == -1) {
 659                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
 660                     }
 661                     return;
 662                 }
 663             }
 664         }
 665     } while (offset != RuleBasedBreakIterator::DONE);
 666
 667     // now do it backwards...
 668     offset = iterator.last();
 669     count = 0;
 670
 671     do {
 672         testOffset = testIterator->last();
 673         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
 674         if (offset != testOffset)
 675             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
 676
 677         if (offset != RuleBasedBreakIterator::DONE) {
 678             count--;
 679             offset = iterator.previous();
 680         }
 681     } while (offset != RuleBasedBreakIterator::DONE);
 682
 683     delete testIterator;
 684 }
 685
 686
 687 //---------------------------------------------
 688 //
 689 //     other tests
 690 //
 691 //---------------------------------------------
 692 void RBBITest::TestEmptyString()
 693 {
 694     UnicodeString text = "";
 695     UErrorCode status = U_ZERO_ERROR;
 696
 697     BITestData x(status);
 698     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
 699     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
 700     if (U_FAILURE(status))
 701     {
 702         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
 703         return;
 704     }
 705     generalIteratorTest(*bi, x);
 706     delete bi;
 707 }
 708
 709 void RBBITest::TestGetAvailableLocales()
 710 {
 711     int32_t locCount = 0;
 712     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
 713
 714     if (locCount == 0)
 715         dataerrln("getAvailableLocales() returned an empty list!");
 716     // Just make sure that it's returning good memory.
 717     int32_t i;
 718     for (i = 0; i < locCount; ++i) {
 719         logln(locList[i].getName());
 720     }
 721 }
 722
 723 //Testing the BreakIterator::getDisplayName() function
 724 void RBBITest::TestGetDisplayName()
 725 {
 726     UnicodeString   result;
 727
 728     BreakIterator::getDisplayName(Locale::getUS(), result);
 729     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
 730         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
 731                 + result);
 732
 733     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
 734     if (result != "French (France)")
 735         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
 736                 + result);
 737 }
 738 /**
 739  * Test End Behaviour
 740  * @bug 4068137
 741  */
 742 void RBBITest::TestEndBehaviour()
 743 {
 744     UErrorCode status = U_ZERO_ERROR;
 745     UnicodeString testString("boo.");
 746     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
 747     if (U_FAILURE(status))
 748     {
 749         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
 750         return;
 751     }
 752     wb->setText(testString);
 753
 754     if (wb->first() != 0)
 755         errln("Didn't get break at beginning of string.");
 756     if (wb->next() != 3)
 757         errln("Didn't get break before period in \"boo.\"");
 758     if (wb->current() != 4 && wb->next() != 4)
 759         errln("Didn't get break at end of string.");
 760     delete wb;
 761 }
 762 /*
 763  * @bug 4153072
 764  */
 765 void RBBITest::TestBug4153072() {
 766     UErrorCode status = U_ZERO_ERROR;
 767     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
 768     if (U_FAILURE(status))
 769     {
 770         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
 771         return;
 772     }
 773     UnicodeString str("...Hello, World!...");
 774     int32_t begin = 3;
 775     int32_t end = str.length() - 3;
 776     UBool onBoundary;
 777
 778     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
 779     iter->adoptText(textIterator);
 780     int index;
 781     // Note: with the switch to UText, there is no way to restrict the
 782     //       iteration range to begin at an index other than zero.
 783     //       String character iterators created with a non-zero bound are
 784     //         treated by RBBI as being empty.
 785     for (index = -1; index < begin + 1; ++index) {
 786         onBoundary = iter->isBoundary(index);
 787         if (index == 0?  !onBoundary : onBoundary) {
 788             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
 789                             " and begin index = " + begin);
 790         }
 791     }
 792     delete iter;
 793 }
 794
 795
 796 //
 797 // Test for problem reported by Ashok Matoria on 9 July 2007
 798 //    One.<kSoftHyphen><kSpace>Two.
 799 //
 800 //    Sentence break at start (0) and then on calling next() it breaks at
 801 //   'T' of "Two". Now, at this point if I do next() and
 802 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
 803 //
 804 void RBBITest::TestBug5775() {
 805     UErrorCode status = U_ZERO_ERROR;
 806     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
 807     TEST_ASSERT_SUCCESS(status);
 808     if (U_FAILURE(status)) {
 809         return;
 810     }
 811 // Check for status first for better handling of no data errors.
 812     TEST_ASSERT(bi != NULL);
 813     if (bi == NULL) {
 814         return;
 815     }
 816
 817     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
 818     //               01234      56789
 819     s = s.unescape();
 820     bi->setText(s);
 821     int pos = bi->next();
 822     TEST_ASSERT(pos == 6);
 823     pos = bi->next();
 824     TEST_ASSERT(pos == 10);
 825     pos = bi->previous();
 826     TEST_ASSERT(pos == 6);
 827     delete bi;
 828 }
 829
 830
 831
 832 //------------------------------------------------------------------------------
 833 //
 834 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
 835 //
 836 //------------------------------------------------------------------------------
 837
 838 struct TestParams {
 839     BreakIterator   *bi;                   // Break iterator is set while parsing test source.
 840                                            //   Changed out whenever test data changes break type.
 841
 842     UnicodeString    dataToBreak;          // Data that is built up while parsing the test.
 843     UVector32       *expectedBreaks;       // Expected break positions, matches dataToBreak UnicodeString.
 844     UVector32       *srcLine;              // Positions in source file, indexed same as dataToBreak.
 845     UVector32       *srcCol;
 846
 847     UText           *textToBreak;          // UText, could be UTF8 or UTF16.
 848     UVector32       *textMap;              // Map from UTF-16 dataToBreak offsets to UText offsets.
 849     CharString       utf8String;           // UTF-8 form of text to break.
 850
 851     TestParams(UErrorCode &status) : dataToBreak() {
 852         bi               = NULL;
 853         expectedBreaks   = new UVector32(status);
 854         srcLine          = new UVector32(status);
 855         srcCol           = new UVector32(status);
 856         textToBreak      = NULL;
 857         textMap          = new UVector32(status);
 858     }
 859
 860     ~TestParams() {
 861         delete bi;
 862         delete expectedBreaks;
 863         delete srcLine;
 864         delete srcCol;
 865         utext_close(textToBreak);
 866         delete textMap;
 867     }
 868
 869     int32_t getSrcLine(int32_t bp);
 870     int32_t getExpectedBreak(int32_t bp);
 871     int32_t getSrcCol(int32_t bp);
 872
 873     void setUTF16(UErrorCode &status);
 874     void setUTF8(UErrorCode &status);
 875 };
 876
 877 // Append a UnicodeString to a CharString with UTF-8 encoding.
 878 // Substitute any invalid chars.
 879 //   Note: this is used with test data that includes a few unpaired surrogates in the UTF-16 that will be substituted.
 880 static void CharStringAppend(CharString &dest, const UnicodeString &src, UErrorCode &status) {
 881     if (U_FAILURE(status)) {
 882         return;
 883     }
 884     int32_t utf8Length;
 885     u_strToUTF8WithSub(NULL, 0, &utf8Length,            // Output Buffer, NULL for preflight.
 886                        src.getBuffer(), src.length(),   // UTF-16 data
 887                        0xfffd, NULL,                    // Substitution char, number of subs.
 888                        &status);
 889     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
 890         return;
 891     }
 892     status = U_ZERO_ERROR;
 893     int32_t capacity;
 894     char *buffer = dest.getAppendBuffer(utf8Length, utf8Length, capacity, status);
 895     u_strToUTF8WithSub(buffer, utf8Length, NULL,
 896                        src.getBuffer(), src.length(),
 897                        0xfffd, NULL, &status);
 898     dest.append(buffer, utf8Length, status);
 899 }
 900
 901
 902 void TestParams::setUTF16(UErrorCode &status) {
 903     textToBreak = utext_openUnicodeString(textToBreak, &dataToBreak, &status);
 904     textMap->removeAllElements();
 905     for (int32_t i=0; i<dataToBreak.length(); i++) {
 906         if (i == dataToBreak.getChar32Start(i)) {
 907             textMap->addElement(i, status);
 908         } else {
 909             textMap->addElement(-1, status);
 910         }
 911     }
 912     textMap->addElement(dataToBreak.length(), status);
 913     U_ASSERT(dataToBreak.length() + 1 == textMap->size());
 914 }
 915
 916
 917 void TestParams::setUTF8(UErrorCode &status) {
 918     if (U_FAILURE(status)) {
 919         return;
 920     }
 921     utf8String.clear();
 922     CharStringAppend(utf8String, dataToBreak, status);
 923     textToBreak = utext_openUTF8(textToBreak, utf8String.data(), utf8String.length(), &status);
 924     if (U_FAILURE(status)) {
 925         return;
 926     }
 927
 928     textMap->removeAllElements();
 929     int32_t utf16Index = 0;
 930     for (;;) {
 931         textMap->addElement(utf16Index, status);
 932         UChar32 c32 = utext_current32(textToBreak);
 933         if (c32 < 0) {
 934             break;
 935         }
 936         utf16Index += U16_LENGTH(c32);
 937         utext_next32(textToBreak);
 938         while (textMap->size() < utext_getNativeIndex(textToBreak)) {
 939             textMap->addElement(-1, status);
 940         }
 941     }
 942     U_ASSERT(utext_nativeLength(textToBreak) + 1 == textMap->size());
 943 }
 944
 945
 946 int32_t TestParams::getSrcLine(int bp) {
 947     if (bp >= textMap->size()) {
 948         bp = textMap->size() - 1;
 949     }
 950     int32_t i = 0;
 951     for(; bp >= 0 ; --bp) {
 952         // Move to a character boundary if we are not on one already.
 953         i = textMap->elementAti(bp);
 954         if (i >= 0) {
 955             break;
 956         }
 957     }
 958     return srcLine->elementAti(i);
 959 }
 960
 961
 962 int32_t TestParams::getExpectedBreak(int bp) {
 963     if (bp >= textMap->size()) {
 964         return 0;
 965     }
 966     int32_t i = textMap->elementAti(bp);
 967     int32_t retVal = 0;
 968     if (i >= 0) {
 969         retVal = expectedBreaks->elementAti(i);
 970     }
 971     return retVal;
 972 }
 973
 974
 975 int32_t TestParams::getSrcCol(int bp) {
 976     if (bp >= textMap->size()) {
 977         bp = textMap->size() - 1;
 978     }
 979     int32_t i = 0;
 980     for(; bp >= 0; --bp) {
 981         // Move bp to a character boundary if we are not on one already.
 982         i = textMap->elementAti(bp);
 983         if (i >= 0) {
 984             break;
 985         }
 986     }
 987     return srcCol->elementAti(i);
 988 }
 989
 990
 991 void RBBITest::executeTest(TestParams *t, UErrorCode &status) {
 992     int32_t    bp;
 993     int32_t    prevBP;
 994     int32_t    i;
 995
 996     TEST_ASSERT_SUCCESS(status);
 997     if (U_FAILURE(status)) {
 998         return;
 999     }
1000
1001     if (t->bi == NULL) {
1002         return;
1003     }
1004
1005     t->bi->setText(t->textToBreak, status);
1006     //
1007     //  Run the iterator forward
1008     //
1009     prevBP = -1;
1010     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1011         if (prevBP ==  bp) {
1012             // Fail for lack of forward progress.
1013             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1014                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1015             break;
1016         }
1017
1018         // Check that there we didn't miss an expected break between the last one
1019         //  and this one.
1020         for (i=prevBP+1; i<bp; i++) {
1021             if (t->getExpectedBreak(i) != 0) {
1022                 int expected[] = {0, i};
1023                 printStringBreaks(t->dataToBreak, expected, 2);
1024                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1025                       i, t->getSrcLine(i), t->getSrcCol(i));
1026             }
1027         }
1028
1029         // Check that the break we did find was expected
1030         if (t->getExpectedBreak(bp) == 0) {
1031             int expected[] = {0, bp};
1032             printStringBreaks(t->textToBreak, expected, 2);
1033             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1034                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1035         } else {
1036             // The break was expected.
1037             //   Check that the {nnn} tag value is correct.
1038             int32_t expectedTagVal = t->getExpectedBreak(bp);
1039             if (expectedTagVal == -1) {
1040                 expectedTagVal = 0;
1041             }
1042             int32_t line = t->getSrcLine(bp);
1043             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1044             if (rs != expectedTagVal) {
1045                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1046                       "          Actual, Expected status = %4d, %4d",
1047                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1048             }
1049         }
1050
1051         prevBP = bp;
1052     }
1053
1054     // Verify that there were no missed expected breaks after the last one found
1055     for (i=prevBP+1; i<utext_nativeLength(t->textToBreak); i++) {
1056         if (t->getExpectedBreak(i) != 0) {
1057             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1058                       i, t->getSrcLine(i), t->getSrcCol(i));
1059         }
1060     }
1061
1062     //
1063     //  Run the iterator backwards, verify that the same breaks are found.
1064     //
1065     prevBP = utext_nativeLength(t->textToBreak)+2;  // start with a phony value for the last break pos seen.
1066     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1067         if (prevBP ==  bp) {
1068             // Fail for lack of progress.
1069             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1070                 bp, t->getSrcLine(bp), t->getSrcCol(bp));
1071             break;
1072         }
1073
1074         // Check that we didn't miss an expected break between the last one
1075         //  and this one.  (UVector returns zeros for index out of bounds.)
1076         for (i=prevBP-1; i>bp; i--) {
1077             if (t->getExpectedBreak(i) != 0) {
1078                 errln("Reverse Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1079                       i, t->getSrcLine(i), t->getSrcCol(i));
1080             }
1081         }
1082
1083         // Check that the break we did find was expected
1084         if (t->getExpectedBreak(bp) == 0) {
1085             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1086                    bp, t->getSrcLine(bp), t->getSrcCol(bp));
1087         } else {
1088             // The break was expected.
1089             //   Check that the {nnn} tag value is correct.
1090             int32_t expectedTagVal = t->getExpectedBreak(bp);
1091             if (expectedTagVal == -1) {
1092                 expectedTagVal = 0;
1093             }
1094             int line = t->getSrcLine(bp);
1095             int32_t rs = t->bi->getRuleStatus();
1096             if (rs != expectedTagVal) {
1097                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1098                       "          Actual, Expected status = %4d, %4d",
1099                     bp, line, t->getSrcCol(bp), rs, expectedTagVal);
1100             }
1101         }
1102
1103         prevBP = bp;
1104     }
1105
1106     // Verify that there were no missed breaks prior to the last one found
1107     for (i=prevBP-1; i>=0; i--) {
1108         if (t->getExpectedBreak(i) != 0) {
1109             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1110                       i, t->getSrcLine(i), t->getSrcCol(i));
1111         }
1112     }
1113
1114     // Check isBoundary()
1115     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1116         UBool boundaryExpected = (t->getExpectedBreak(i) != 0);
1117         UBool boundaryFound    = t->bi->isBoundary(i);
1118         if (boundaryExpected != boundaryFound) {
1119             errln("isBoundary(%d) incorrect. File line,col= %4d,%4d\n"
1120                   "        Expected, Actual= %s, %s",
1121                   i, t->getSrcLine(i), t->getSrcCol(i),
1122                   boundaryExpected ? "true":"false", boundaryFound? "true" : "false");
1123         }
1124     }
1125
1126     // Check following()
1127     for (i=0; i < utext_nativeLength(t->textToBreak); i++) {
1128         int32_t actualBreak = t->bi->following(i);
1129         int32_t expectedBreak = BreakIterator::DONE;
1130         for (int32_t j=i+1; j <= utext_nativeLength(t->textToBreak); j++) {
1131             if (t->getExpectedBreak(j) != 0) {
1132                 expectedBreak = j;
1133                 break;
1134             }
1135         }
1136         if (expectedBreak != actualBreak) {
1137             errln("following(%d) incorrect. File line,col= %4d,%4d\n"
1138                   "        Expected, Actual= %d, %d",
1139                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1140         }
1141     }
1142
1143     // Check preceding()
1144     for (i=utext_nativeLength(t->textToBreak); i>=0; i--) {
1145         int32_t actualBreak = t->bi->preceding(i);
1146         int32_t expectedBreak = BreakIterator::DONE;
1147
1148         // For UTF-8 & UTF-16 supplementals, all code units of a character are equivalent.
1149         // preceding(trailing byte) will return the index of some preceding code point,
1150         // not the lead byte of the current code point, even though that has a smaller index.
1151         // Therefore, start looking at the expected break data not at i-1, but at
1152         // the start of code point index - 1.
1153         utext_setNativeIndex(t->textToBreak, i);
1154         int32_t j = utext_getNativeIndex(t->textToBreak) - 1;
1155         for (; j >= 0; j--) {
1156             if (t->getExpectedBreak(j) != 0) {
1157                 expectedBreak = j;
1158                 break;
1159             }
1160         }
1161         if (expectedBreak != actualBreak) {
1162             errln("preceding(%d) incorrect. File line,col= %4d,%4d\n"
1163                   "        Expected, Actual= %d, %d",
1164                   i, t->getSrcLine(i), t->getSrcCol(i), expectedBreak, actualBreak);
1165         }
1166     }
1167 }
1168
1169
1170 void RBBITest::TestExtended() {
1171 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1172     UErrorCode      status  = U_ZERO_ERROR;
1173     Locale          locale("");
1174
1175     UnicodeString       rules;
1176     TestParams          tp(status);
1177
1178     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_@=-]*) *>"), 0, status);
1179     if (U_FAILURE(status)) {
1180         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1181     }
1182
1183
1184     //
1185     //  Open and read the test data file.
1186     //
1187     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1188     char testFileName[1000];
1189     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1190         errln("Can't open test data.  Path too long.");
1191         return;
1192     }
1193     strcpy(testFileName, testDataDirectory);
1194     strcat(testFileName, "rbbitst.txt");
1195
1196     int    len;
1197     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1198     if (U_FAILURE(status)) {
1199         return; /* something went wrong, error already output */
1200     }
1201
1202
1203
1204
1205     //
1206     //  Put the test data into a UnicodeString
1207     //
1208     UnicodeString testString(FALSE, testFile, len);
1209
1210     enum EParseState{
1211         PARSE_COMMENT,
1212         PARSE_TAG,
1213         PARSE_DATA,
1214         PARSE_NUM
1215     }
1216     parseState = PARSE_TAG;
1217
1218     EParseState savedState = PARSE_TAG;
1219
1220     static const UChar CH_LF        = 0x0a;
1221     static const UChar CH_CR        = 0x0d;
1222     static const UChar CH_HASH      = 0x23;
1223     /*static const UChar CH_PERIOD    = 0x2e;*/
1224     static const UChar CH_LT        = 0x3c;
1225     static const UChar CH_GT        = 0x3e;
1226     static const UChar CH_BACKSLASH = 0x5c;
1227     static const UChar CH_BULLET    = 0x2022;
1228
1229     int32_t    lineNum  = 1;
1230     int32_t    colStart = 0;
1231     int32_t    column   = 0;
1232     int32_t    charIdx  = 0;
1233
1234     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1235
1236     for (charIdx = 0; charIdx < len; ) {
1237         status = U_ZERO_ERROR;
1238         UChar  c = testString.charAt(charIdx);
1239         charIdx++;
1240         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1241             // treat CRLF as a unit
1242             c = CH_LF;
1243             charIdx++;
1244         }
1245         if (c == CH_LF || c == CH_CR) {
1246             lineNum++;
1247             colStart = charIdx;
1248         }
1249         column = charIdx - colStart + 1;
1250
1251         switch (parseState) {
1252         case PARSE_COMMENT:
1253             if (c == 0x0a || c == 0x0d) {
1254                 parseState = savedState;
1255             }
1256             break;
1257
1258         case PARSE_TAG:
1259             {
1260             if (c == CH_HASH) {
1261                 parseState = PARSE_COMMENT;
1262                 savedState = PARSE_TAG;
1263                 break;
1264             }
1265             if (u_isUWhiteSpace(c)) {
1266                 break;
1267             }
1268             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1269                 delete tp.bi;
1270                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1271                 charIdx += 5;
1272                 break;
1273             }
1274             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1275                 delete tp.bi;
1276                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1277                 charIdx += 5;
1278                 break;
1279             }
1280             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1281                 delete tp.bi;
1282                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1283                 charIdx += 5;
1284                 break;
1285             }
1286             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1287                 delete tp.bi;
1288                 tp.bi = NULL;
1289                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1290                 charIdx += 5;
1291                 break;
1292             }
1293             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1294                 delete tp.bi;
1295                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1296                 charIdx += 6;
1297                 break;
1298             }
1299
1300             // <locale  loc_name>
1301             localeMatcher.reset(testString);
1302             if (localeMatcher.lookingAt(charIdx-1, status)) {
1303                 UnicodeString localeName = localeMatcher.group(1, status);
1304                 char localeName8[100];
1305                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1306                 locale = Locale::createFromName(localeName8);
1307                 charIdx += localeMatcher.group(0, status).length() - 1;
1308                 TEST_ASSERT_SUCCESS(status);
1309                 break;
1310             }
1311             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1312                 parseState = PARSE_DATA;
1313                 charIdx += 5;
1314                 tp.dataToBreak = "";
1315                 tp.expectedBreaks->removeAllElements();
1316                 tp.srcCol ->removeAllElements();
1317                 tp.srcLine->removeAllElements();
1318                 break;
1319             }
1320
1321             errln("line %d: Tag expected in test file.", lineNum);
1322             parseState = PARSE_COMMENT;
1323             savedState = PARSE_DATA;
1324             goto end_test; // Stop the test.
1325             }
1326             break;
1327
1328         case PARSE_DATA:
1329             if (c == CH_BULLET) {
1330                 int32_t  breakIdx = tp.dataToBreak.length();
1331                 tp.expectedBreaks->setSize(breakIdx+1);
1332                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1333                 tp.srcLine->setSize(breakIdx+1);
1334                 tp.srcLine->setElementAt(lineNum, breakIdx);
1335                 tp.srcCol ->setSize(breakIdx+1);
1336                 tp.srcCol ->setElementAt(column, breakIdx);
1337                 break;
1338             }
1339
1340             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1341                 // Add final entry to mappings from break location to source file position.
1342                 //  Need one extra because last break position returned is after the
1343                 //    last char in the data, not at the last char.
1344                 tp.srcLine->addElement(lineNum, status);
1345                 tp.srcCol ->addElement(column, status);
1346
1347                 parseState = PARSE_TAG;
1348                 charIdx += 6;
1349
1350                 // RUN THE TEST!
1351                 status = U_ZERO_ERROR;
1352                 tp.setUTF16(status);
1353                 executeTest(&tp, status);
1354                 TEST_ASSERT_SUCCESS(status);
1355
1356                 // Run again, this time with UTF-8 text wrapped in a UText.
1357                 status = U_ZERO_ERROR;
1358                 tp.setUTF8(status);
1359                 TEST_ASSERT_SUCCESS(status);
1360                 executeTest(&tp, status);
1361                 break;
1362             }
1363
1364             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1365                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1366                 // Get the code point from the name and insert it into the test data.
1367                 //   (Damn, no API takes names in Unicode  !!!
1368                 //    we've got to take it back to char *)
1369                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1370                 int32_t nameLength = nameEndIdx - (charIdx+2);
1371                 char charNameBuf[200];
1372                 UChar32 theChar = -1;
1373                 if (nameEndIdx != -1) {
1374                     UErrorCode status = U_ZERO_ERROR;
1375                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1376                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1377                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1378                     if (U_FAILURE(status)) {
1379                         theChar = -1;
1380                     }
1381                 }
1382                 if (theChar == -1) {
1383                     errln("Error in named character in test file at line %d, col %d",
1384                         lineNum, column);
1385                 } else {
1386                     // Named code point was recognized.  Insert it
1387                     //   into the test data.
1388                     tp.dataToBreak.append(theChar);
1389                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1390                         tp.srcLine->addElement(lineNum, status);
1391                         tp.srcCol ->addElement(column, status);
1392                     }
1393                 }
1394                 if (nameEndIdx > charIdx) {
1395                     charIdx = nameEndIdx+1;
1396
1397                 }
1398                 break;
1399             }
1400
1401
1402
1403
1404             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1405                 charIdx++;
1406                 int32_t  breakIdx = tp.dataToBreak.length();
1407                 tp.expectedBreaks->setSize(breakIdx+1);
1408                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1409                 tp.srcLine->setSize(breakIdx+1);
1410                 tp.srcLine->setElementAt(lineNum, breakIdx);
1411                 tp.srcCol ->setSize(breakIdx+1);
1412                 tp.srcCol ->setElementAt(column, breakIdx);
1413                 break;
1414             }
1415
1416             if (c == CH_LT) {
1417                 tagValue   = 0;
1418                 parseState = PARSE_NUM;
1419                 break;
1420             }
1421
1422             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1423                 parseState = PARSE_COMMENT;
1424                 savedState = PARSE_DATA;
1425                 break;
1426             }
1427
1428             if (c == CH_BACKSLASH) {
1429                 // Check for \ at end of line, a line continuation.
1430                 //     Advance over (discard) the newline
1431                 UChar32 cp = testString.char32At(charIdx);
1432                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1433                     // We have a CR LF
1434                     //  Need an extra increment of the input ptr to move over both of them
1435                     charIdx++;
1436                 }
1437                 if (cp == CH_LF || cp == CH_CR) {
1438                     lineNum++;
1439                     colStart = charIdx;
1440                     charIdx++;
1441                     break;
1442                 }
1443
1444                 // Let unescape handle the back slash.
1445                 cp = testString.unescapeAt(charIdx);
1446                 if (cp != -1) {
1447                     // Escape sequence was recognized.  Insert the char
1448                     //   into the test data.
1449                     tp.dataToBreak.append(cp);
1450                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1451                         tp.srcLine->addElement(lineNum, status);
1452                         tp.srcCol ->addElement(column, status);
1453                     }
1454                     break;
1455                 }
1456
1457
1458                 // Not a recognized backslash escape sequence.
1459                 // Take the next char as a literal.
1460                 //  TODO:  Should this be an error?
1461                 c = testString.charAt(charIdx);
1462                 charIdx = testString.moveIndex32(charIdx, 1);
1463             }
1464
1465             // Normal, non-escaped data char.
1466             tp.dataToBreak.append(c);
1467
1468             // Save the mapping from offset in the data to line/column numbers in
1469             //   the original input file.  Will be used for better error messages only.
1470             //   If there's an expected break before this char, the slot in the mapping
1471             //     vector will already be set for this char; don't overwrite it.
1472             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1473                 tp.srcLine->addElement(lineNum, status);
1474                 tp.srcCol ->addElement(column, status);
1475             }
1476             break;
1477
1478
1479         case PARSE_NUM:
1480             // We are parsing an expected numeric tag value, like <1234>,
1481             //   within a chunk of data.
1482             if (u_isUWhiteSpace(c)) {
1483                 break;
1484             }
1485
1486             if (c == CH_GT) {
1487                 // Finished the number.  Add the info to the expected break data,
1488                 //   and switch parse state back to doing plain data.
1489                 parseState = PARSE_DATA;
1490                 if (tagValue == 0) {
1491                     tagValue = -1;
1492                 }
1493                 int32_t  breakIdx = tp.dataToBreak.length();
1494                 tp.expectedBreaks->setSize(breakIdx+1);
1495                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1496                 tp.srcLine->setSize(breakIdx+1);
1497                 tp.srcLine->setElementAt(lineNum, breakIdx);
1498                 tp.srcCol ->setSize(breakIdx+1);
1499                 tp.srcCol ->setElementAt(column, breakIdx);
1500                 break;
1501             }
1502
1503             if (u_isdigit(c)) {
1504                 tagValue = tagValue*10 + u_charDigitValue(c);
1505                 break;
1506             }
1507
1508             errln("Syntax Error in test file at line %d, col %d",
1509                 lineNum, column);
1510             parseState = PARSE_COMMENT;
1511             goto end_test; // Stop the test
1512             break;
1513         }
1514
1515
1516         if (U_FAILURE(status)) {
1517             dataerrln("ICU Error %s while parsing test file at line %d.",
1518                 u_errorName(status), lineNum);
1519             status = U_ZERO_ERROR;
1520             goto end_test; // Stop the test
1521         }
1522
1523     }
1524
1525 end_test:
1526     delete [] testFile;
1527 #endif
1528 }
1529
1530
1531 //-------------------------------------------------------------------------------
1532 //
1533 //  TestDictRules   create a break iterator from source rules that includes a
1534 //                  dictionary range.   Regression for bug #7130.  Source rules
1535 //                  do not declare a break iterator type (word, line, sentence, etc.
1536 //                  but the dictionary code, without a type, would loop.
1537 //
1538 //-------------------------------------------------------------------------------
1539 void RBBITest::TestDictRules() {
1540     const char *rules =  "$dictionary = [a-z]; \n"
1541                          "!!forward; \n"
1542                          "$dictionary $dictionary; \n"
1543                          "!!reverse; \n"
1544                          "$dictionary $dictionary; \n";
1545     const char *text = "aa";
1546     UErrorCode status = U_ZERO_ERROR;
1547     UParseError parseError;
1548
1549     RuleBasedBreakIterator bi(rules, parseError, status);
1550     if (U_SUCCESS(status)) {
1551         UnicodeString utext = text;
1552         bi.setText(utext);
1553         int32_t position;
1554         int32_t loops;
1555         for (loops = 0; loops<10; loops++) {
1556             position = bi.next();
1557             if (position == RuleBasedBreakIterator::DONE) {
1558                 break;
1559             }
1560         }
1561         TEST_ASSERT(loops == 1);
1562     } else {
1563         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
1564     }
1565 }
1566
1567
1568
1569 //-------------------------------------------------------------------------------
1570 //
1571 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
1572 //    return the datain one big UChar * buffer, which the caller must delete.
1573 //
1574 //    parameters:
1575 //          fileName:   the name of the file, with no directory part.  The test data directory
1576 //                      is assumed.
1577 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
1578 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
1579 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
1580 //                      Pass NULL for the system default encoding.
1581 //          status
1582 //    returns:
1583 //                      The file data, converted to UChar.
1584 //                      The caller must delete this when done with
1585 //                           delete [] theBuffer;
1586 //
1587 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
1588 //           Move this function to some common place.
1589 //
1590 //--------------------------------------------------------------------------------
1591 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
1592     UChar       *retPtr  = NULL;
1593     char        *fileBuf = NULL;
1594     UConverter* conv     = NULL;
1595     FILE        *f       = NULL;
1596
1597     ulen = 0;
1598     if (U_FAILURE(status)) {
1599         return retPtr;
1600     }
1601
1602     //
1603     //  Open the file.
1604     //
1605     f = fopen(fileName, "rb");
1606     if (f == 0) {
1607         dataerrln("Error opening test data file %s\n", fileName);
1608         status = U_FILE_ACCESS_ERROR;
1609         return NULL;
1610     }
1611     //
1612     //  Read it in
1613     //
1614     int   fileSize;
1615     int   amt_read;
1616
1617     fseek( f, 0, SEEK_END);
1618     fileSize = ftell(f);
1619     fileBuf = new char[fileSize];
1620     fseek(f, 0, SEEK_SET);
1621     amt_read = fread(fileBuf, 1, fileSize, f);
1622     if (amt_read != fileSize || fileSize <= 0) {
1623         errln("Error reading test data file.");
1624         goto cleanUpAndReturn;
1625     }
1626
1627     //
1628     // Look for a Unicode Signature (BOM) on the data just read
1629     //
1630     int32_t        signatureLength;
1631     const char *   fileBufC;
1632     const char*    bomEncoding;
1633
1634     fileBufC = fileBuf;
1635     bomEncoding = ucnv_detectUnicodeSignature(
1636         fileBuf, fileSize, &signatureLength, &status);
1637     if(bomEncoding!=NULL ){
1638         fileBufC  += signatureLength;
1639         fileSize  -= signatureLength;
1640         encoding = bomEncoding;
1641     }
1642
1643     //
1644     // Open a converter to take the rule file to UTF-16
1645     //
1646     conv = ucnv_open(encoding, &status);
1647     if (U_FAILURE(status)) {
1648         goto cleanUpAndReturn;
1649     }
1650
1651     //
1652     // Convert the rules to UChar.
1653     //  Preflight first to determine required buffer size.
1654     //
1655     ulen = ucnv_toUChars(conv,
1656         NULL,           //  dest,
1657         0,              //  destCapacity,
1658         fileBufC,
1659         fileSize,
1660         &status);
1661     if (status == U_BUFFER_OVERFLOW_ERROR) {
1662         // Buffer Overflow is expected from the preflight operation.
1663         status = U_ZERO_ERROR;
1664
1665         retPtr = new UChar[ulen+1];
1666         ucnv_toUChars(conv,
1667             retPtr,       //  dest,
1668             ulen+1,
1669             fileBufC,
1670             fileSize,
1671             &status);
1672     }
1673
1674 cleanUpAndReturn:
1675     fclose(f);
1676     delete []fileBuf;
1677     ucnv_close(conv);
1678     if (U_FAILURE(status)) {
1679         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
1680         delete []retPtr;
1681         retPtr = 0;
1682         ulen   = 0;
1683     };
1684     return retPtr;
1685 }
1686
1687
1688
1689 //--------------------------------------------------------------------------------------------
1690 //
1691 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
1692 //
1693 //-------------------------------------------------------------------------------------------
1694 void RBBITest::TestUnicodeFiles() {
1695     RuleBasedBreakIterator  *bi;
1696     UErrorCode               status = U_ZERO_ERROR;
1697
1698     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
1699     TEST_ASSERT_SUCCESS(status);
1700     if (U_SUCCESS(status)) {
1701         runUnicodeTestData("GraphemeBreakTest.txt", bi);
1702     }
1703     delete bi;
1704
1705     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
1706     TEST_ASSERT_SUCCESS(status);
1707     if (U_SUCCESS(status)) {
1708         runUnicodeTestData("WordBreakTest.txt", bi);
1709     }
1710     delete bi;
1711
1712     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1713     TEST_ASSERT_SUCCESS(status);
1714     if (U_SUCCESS(status)) {
1715         runUnicodeTestData("SentenceBreakTest.txt", bi);
1716     }
1717     delete bi;
1718
1719     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
1720     TEST_ASSERT_SUCCESS(status);
1721     if (U_SUCCESS(status)) {
1722         runUnicodeTestData("LineBreakTest.txt", bi);
1723     }
1724     delete bi;
1725 }
1726
1727
1728 // Check for test cases from the Unicode test data files that are known to fail
1729 // and should be skipped because ICU is not yet able to fully implement the spec.
1730 // See ticket #7270.
1731
1732 UBool RBBITest::testCaseIsKnownIssue(const UnicodeString &testCase, const char *fileName) {
1733     static const UChar badTestCases[][4] = {                     // Line Numbers from Unicode 7.0.0 file.
1734         {(UChar)0x200B, (UChar)0x0020, (UChar)0x007D, (UChar)0x0000},   // Line 5198
1735         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0029, (UChar)0x0000},   // Line 5202
1736         {(UChar)0x200B, (UChar)0x0020, (UChar)0x0021, (UChar)0x0000},   // Line 5214
1737         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002c, (UChar)0x0000},   // Line 5246
1738         {(UChar)0x200B, (UChar)0x0020, (UChar)0x002f, (UChar)0x0000},   // Line 5298
1739         {(UChar)0x200B, (UChar)0x0020, (UChar)0x2060, (UChar)0x0000}    // Line 5302
1740     };
1741     if (strcmp(fileName, "LineBreakTest.txt") != 0) {
1742         return FALSE;
1743     }
1744
1745     for (int i=0; i<UPRV_LENGTHOF(badTestCases); i++) {
1746         if (testCase == UnicodeString(badTestCases[i])) {
1747             return logKnownIssue("7270");
1748         }
1749     }
1750     return FALSE;
1751 }
1752
1753
1754 //--------------------------------------------------------------------------------------------
1755 //
1756 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
1757 //
1758 //-------------------------------------------------------------------------------------------
1759 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
1760 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1761     UErrorCode  status = U_ZERO_ERROR;
1762
1763     //
1764     //  Open and read the test data file, put it into a UnicodeString.
1765     //
1766     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1767     char testFileName[1000];
1768     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1769         dataerrln("Can't open test data.  Path too long.");
1770         return;
1771     }
1772     strcpy(testFileName, testDataDirectory);
1773     strcat(testFileName, fileName);
1774
1775     logln("Opening data file %s\n", fileName);
1776
1777     int    len;
1778     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1779     if (status != U_FILE_ACCESS_ERROR) {
1780         TEST_ASSERT_SUCCESS(status);
1781         TEST_ASSERT(testFile != NULL);
1782     }
1783     if (U_FAILURE(status) || testFile == NULL) {
1784         return; /* something went wrong, error already output */
1785     }
1786     UnicodeString testFileAsString(TRUE, testFile, len);
1787
1788     //
1789     //  Parse the test data file using a regular expression.
1790     //  Each kind of token is recognized in its own capture group; what type of item was scanned
1791     //     is identified by which group had a match.
1792     //
1793     //    Caputure Group #                  1          2            3            4           5
1794     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
1795     //
1796     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
1797     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
1798     UnicodeString   testString;
1799     UVector32       breakPositions(status);
1800     int             lineNumber = 1;
1801     TEST_ASSERT_SUCCESS(status);
1802     if (U_FAILURE(status)) {
1803         return;
1804     }
1805
1806     //
1807     //  Scan through each test case, building up the string to be broken in testString,
1808     //   and the positions that should be boundaries in the breakPositions vector.
1809     //
1810     int spin = 0;
1811     while (tokenMatcher.find()) {
1812         if(tokenMatcher.hitEnd()) {
1813           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
1814              This occurred when the text file was corrupt (wasn't marked as UTF-8)
1815              and caused an infinite loop here on EBCDIC systems!
1816           */
1817           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
1818           //       return;
1819         }
1820         if (tokenMatcher.start(1, status) >= 0) {
1821             // Scanned a divide sign, indicating a break position in the test data.
1822             if (testString.length()>0) {
1823                 breakPositions.addElement(testString.length(), status);
1824             }
1825         }
1826         else if (tokenMatcher.start(2, status) >= 0) {
1827             // Scanned an 'x', meaning no break at this position in the test data
1828             //   Nothing to be done here.
1829             }
1830         else if (tokenMatcher.start(3, status) >= 0) {
1831             // Scanned Hex digits.  Convert them to binary, append to the character data string.
1832             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
1833             int length = hexNumber.length();
1834             if (length<=8) {
1835                 char buf[10];
1836                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
1837                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
1838                 if (c<=0x10ffff) {
1839                     testString.append(c);
1840                 } else {
1841                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
1842                        fileName, lineNumber);
1843                 }
1844             } else {
1845                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
1846                        fileName, lineNumber);
1847              }
1848         }
1849         else if (tokenMatcher.start(4, status) >= 0) {
1850             // Scanned to end of a line, possibly skipping over a comment in the process.
1851             //   If the line from the file contained test data, run the test now.
1852             if (testString.length() > 0 && !testCaseIsKnownIssue(testString, fileName)) {
1853                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
1854             }
1855
1856             // Clear out this test case.
1857             //    The string and breakPositions vector will be refilled as the next
1858             //       test case is parsed.
1859             testString.remove();
1860             breakPositions.removeAllElements();
1861             lineNumber++;
1862         } else {
1863             // Scanner catchall.  Something unrecognized appeared on the line.
1864             char token[16];
1865             UnicodeString uToken = tokenMatcher.group(0, status);
1866             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
1867             token[sizeof(token)-1] = 0;
1868             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
1869
1870             // Clean up, in preparation for continuing with the next line.
1871             testString.remove();
1872             breakPositions.removeAllElements();
1873             lineNumber++;
1874         }
1875         TEST_ASSERT_SUCCESS(status);
1876         if (U_FAILURE(status)) {
1877             break;
1878         }
1879     }
1880
1881     delete [] testFile;
1882  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
1883 }
1884
1885 //--------------------------------------------------------------------------------------------
1886 //
1887 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
1888 //                            test data files.  Do only a simple, forward-only check -
1889 //                            this test is mostly to check that ICU and the Unicode
1890 //                            data agree with each other.
1891 //
1892 //--------------------------------------------------------------------------------------------
1893 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
1894                          const UnicodeString &testString,   // Text data to be broken
1895                          UVector32 *breakPositions,         // Positions where breaks should be found.
1896                          RuleBasedBreakIterator *bi) {
1897     int32_t pos;                 // Break Position in the test string
1898     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
1899     int32_t expectedPos;         // Expected break position (index into test string)
1900
1901     bi->setText(testString);
1902     pos = bi->first();
1903     pos = bi->next();
1904
1905     while (pos != BreakIterator::DONE) {
1906         if (expectedI >= breakPositions->size()) {
1907             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1908                 testFileName, lineNumber, pos);
1909             break;
1910         }
1911         expectedPos = breakPositions->elementAti(expectedI);
1912         if (pos < expectedPos) {
1913             errln("Test file \"%s\", line %d, unexpected break found at position %d",
1914                 testFileName, lineNumber, pos);
1915             break;
1916         }
1917         if (pos > expectedPos) {
1918             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1919                 testFileName, lineNumber, expectedPos);
1920             break;
1921         }
1922         pos = bi->next();
1923         expectedI++;
1924     }
1925
1926     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
1927         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
1928             testFileName, lineNumber, breakPositions->elementAti(expectedI));
1929     }
1930 }
1931
1932
1933
1934 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1935 //---------------------------------------------------------------------------------------
1936 //
1937 //   classs RBBIMonkeyKind
1938 //
1939 //      Monkey Test for Break Iteration
1940 //      Abstract interface class.   Concrete derived classes independently
1941 //      implement the break rules for different iterator types.
1942 //
1943 //      The Monkey Test itself uses doesn't know which type of break iterator it is
1944 //      testing, but works purely in terms of the interface defined here.
1945 //
1946 //---------------------------------------------------------------------------------------
1947 class RBBIMonkeyKind {
1948 public:
1949     // Return a UVector of UnicodeSets, representing the character classes used
1950     //   for this type of iterator.
1951     virtual  UVector  *charClasses() = 0;
1952
1953     // Set the test text on which subsequent calls to next() will operate
1954     virtual  void      setText(const UnicodeString &s) = 0;
1955
1956     // Find the next break postion, starting from the prev break position, or from zero.
1957     // Return -1 after reaching end of string.
1958     virtual  int32_t   next(int32_t i) = 0;
1959
1960     virtual ~RBBIMonkeyKind();
1961     UErrorCode       deferredStatus;
1962
1963
1964 protected:
1965     RBBIMonkeyKind();
1966
1967 private:
1968 };
1969
1970 RBBIMonkeyKind::RBBIMonkeyKind() {
1971     deferredStatus = U_ZERO_ERROR;
1972 }
1973
1974 RBBIMonkeyKind::~RBBIMonkeyKind() {
1975 }
1976
1977
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand()
//                    Not using library to
//                      1.  Get same results on all platforms.
//                      2.  Get access to current seed, to more easily reproduce failures.
//
//---------------------------------------------------------------------------------------

// Current generator state.  Assign to it to re-seed.
static uint32_t m_seed = 1;

// Advance the generator one step (unsigned 32-bit arithmetic, wrapping on
// overflow) and return bits 16..30 of the new state, a value in 0..32767.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = multiplier * m_seed + increment;
    return (m_seed >> 16) & 0x7fff;
}
1993
1994
1995 //------------------------------------------------------------------------------------------
1996 //
1997 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
1998 //                             of RBBIMonkeyKind.
1999 //
2000 //------------------------------------------------------------------------------------------
2001 class RBBICharMonkey: public RBBIMonkeyKind {
2002 public:
2003     RBBICharMonkey();
2004     virtual          ~RBBICharMonkey();
2005     virtual  UVector *charClasses();
2006     virtual  void     setText(const UnicodeString &s);
2007     virtual  int32_t  next(int32_t i);
2008 private:
2009     UVector   *fSets;
2010
2011     UnicodeSet  *fCRLFSet;
2012     UnicodeSet  *fControlSet;
2013     UnicodeSet  *fExtendSet;
2014     UnicodeSet  *fRegionalIndicatorSet;
2015     UnicodeSet  *fPrependSet;
2016     UnicodeSet  *fSpacingSet;
2017     UnicodeSet  *fLSet;
2018     UnicodeSet  *fVSet;
2019     UnicodeSet  *fTSet;
2020     UnicodeSet  *fLVSet;
2021     UnicodeSet  *fLVTSet;
2022     UnicodeSet  *fHangulSet;
2023     UnicodeSet  *fAnySet;
2024
2025     const UnicodeString *fText;
2026 };
2027
2028
2029 RBBICharMonkey::RBBICharMonkey() {
2030     UErrorCode  status = U_ZERO_ERROR;
2031
2032     fText = NULL;
2033
2034     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2035     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2036     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2037     fRegionalIndicatorSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"), status);
2038     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2039     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2040     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2041     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2042     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2043     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2044     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2045     fHangulSet  = new UnicodeSet();
2046     fHangulSet->addAll(*fLSet);
2047     fHangulSet->addAll(*fVSet);
2048     fHangulSet->addAll(*fTSet);
2049     fHangulSet->addAll(*fLVSet);
2050     fHangulSet->addAll(*fLVTSet);
2051     fAnySet     = new UnicodeSet(0, 0x10ffff);
2052
2053     fSets       = new UVector(status);
2054     fSets->addElement(fCRLFSet,    status);
2055     fSets->addElement(fControlSet, status);
2056     fSets->addElement(fExtendSet,  status);
2057     fSets->addElement(fRegionalIndicatorSet, status);
2058     if (!fPrependSet->isEmpty()) {
2059         fSets->addElement(fPrependSet, status);
2060     }
2061     fSets->addElement(fSpacingSet, status);
2062     fSets->addElement(fHangulSet,  status);
2063     fSets->addElement(fAnySet,     status);
2064     if (U_FAILURE(status)) {
2065         deferredStatus = status;
2066     }
2067 }
2068
2069
2070 void RBBICharMonkey::setText(const UnicodeString &s) {
2071     fText = &s;
2072 }
2073
2074
2075
2076 int32_t RBBICharMonkey::next(int32_t prevPos) {
2077     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2078                               //   break position being tested.  The candidate break
2079                               //   location is before p2.
2080
2081     int     breakPos = -1;
2082
2083     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2084
2085     if (U_FAILURE(deferredStatus)) {
2086         return -1;
2087     }
2088
2089     // Previous break at end of string.  return DONE.
2090     if (prevPos >= fText->length()) {
2091         return -1;
2092     }
2093     p0 = p1 = p2 = p3 = prevPos;
2094     c3 =  fText->char32At(prevPos);
2095     c0 = c1 = c2 = 0;
2096     (void)p0;   // suppress set but not used warning.
2097     (void)c0;
2098
2099     // Loop runs once per "significant" character position in the input text.
2100     for (;;) {
2101         // Move all of the positions forward in the input string.
2102         p0 = p1;  c0 = c1;
2103         p1 = p2;  c1 = c2;
2104         p2 = p3;  c2 = c3;
2105
2106         // Advancd p3 by one codepoint
2107         p3 = fText->moveIndex32(p3, 1);
2108         c3 = fText->char32At(p3);
2109
2110         if (p1 == p2) {
2111             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2112             continue;
2113         }
2114         if (p2 == fText->length()) {
2115             // Reached end of string.  Always a break position.
2116             break;
2117         }
2118
2119         // Rule  GB3   CR x LF
2120         //     No Extend or Format characters may appear between the CR and LF,
2121         //     which requires the additional check for p2 immediately following p1.
2122         //
2123         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2124             continue;
2125         }
2126
2127         // Rule (GB4).   ( Control | CR | LF ) <break>
2128         if (fControlSet->contains(c1) ||
2129             c1 == 0x0D ||
2130             c1 == 0x0A)  {
2131             break;
2132         }
2133
2134         // Rule (GB5)    <break>  ( Control | CR | LF )
2135         //
2136         if (fControlSet->contains(c2) ||
2137             c2 == 0x0D ||
2138             c2 == 0x0A)  {
2139             break;
2140         }
2141
2142
2143         // Rule (GB6)  L x ( L | V | LV | LVT )
2144         if (fLSet->contains(c1) &&
2145                (fLSet->contains(c2)  ||
2146                 fVSet->contains(c2)  ||
2147                 fLVSet->contains(c2) ||
2148                 fLVTSet->contains(c2))) {
2149             continue;
2150         }
2151
2152         // Rule (GB7)    ( LV | V )  x  ( V | T )
2153         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2154             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2155             continue;
2156         }
2157
2158         // Rule (GB8)    ( LVT | T)  x T
2159         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2160             fTSet->contains(c2))  {
2161             continue;
2162         }
2163
2164         // Just adding extra Apple rule does here not work, behavior depends on arbitrary context
2165
2166         // Rule (GB8a)    Regional_Indicator x Regional_Indicator
2167         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2168             continue;
2169         }
2170
2171         // Rule (GB9)    Numeric x ALetter
2172         if (fExtendSet->contains(c2))  {
2173             continue;
2174         }
2175
2176         // Rule (GB9a)   x  SpacingMark
2177         if (fSpacingSet->contains(c2)) {
2178             continue;
2179         }
2180
2181         // Rule (GB9b)   Prepend x
2182         if (fPrependSet->contains(c1)) {
2183             continue;
2184         }
2185
2186         // Rule (GB10)  Any  <break>  Any
2187         break;
2188     }
2189
2190     breakPos = p2;
2191     return breakPos;
2192 }
2193
2194
2195
2196 UVector  *RBBICharMonkey::charClasses() {
2197     return fSets;
2198 }
2199
2200
2201 RBBICharMonkey::~RBBICharMonkey() {
2202     delete fSets;
2203     delete fCRLFSet;
2204     delete fControlSet;
2205     delete fExtendSet;
2206     delete fRegionalIndicatorSet;
2207     delete fPrependSet;
2208     delete fSpacingSet;
2209     delete fLSet;
2210     delete fVSet;
2211     delete fTSet;
2212     delete fLVSet;
2213     delete fLVTSet;
2214     delete fHangulSet;
2215     delete fAnySet;
2216 }
2217
2218 //------------------------------------------------------------------------------------------
2219 //
2220 //   class RBBIWordMonkey      Word Break specific implementation
2221 //                             of RBBIMonkeyKind.
2222 //
2223 //------------------------------------------------------------------------------------------
2224 class RBBIWordMonkey: public RBBIMonkeyKind {
2225 public:
2226     RBBIWordMonkey();
2227     virtual          ~RBBIWordMonkey();
2228     virtual  UVector *charClasses();
2229     virtual  void     setText(const UnicodeString &s);
2230     virtual int32_t   next(int32_t i);
2231 private:
2232     UVector      *fSets;
2233
2234     UnicodeSet  *fCRSet;
2235     UnicodeSet  *fLFSet;
2236     UnicodeSet  *fNewlineSet;
2237     UnicodeSet  *fRegionalIndicatorSet;
2238     UnicodeSet  *fKatakanaSet;
2239     UnicodeSet  *fHebrew_LetterSet;
2240     UnicodeSet  *fALetterSet;
2241     // TODO(jungshik): Do we still need this change?
2242     // UnicodeSet  *fALetterSet; // matches ALetterPlus in word.txt
2243     UnicodeSet  *fSingle_QuoteSet;
2244     UnicodeSet  *fDouble_QuoteSet;
2245     UnicodeSet  *fMidNumLetSet;
2246     UnicodeSet  *fMidLetterSet;
2247     UnicodeSet  *fMidNumSet;
2248     UnicodeSet  *fNumericSet;
2249     UnicodeSet  *fFormatSet;
2250     UnicodeSet  *fOtherSet;
2251     UnicodeSet  *fExtendSet;
2252     UnicodeSet  *fExtendNumLetSet;
2253     UnicodeSet  *fDictionaryCjkSet;
2254
2255     const UnicodeString  *fText;
2256 };
2257
2258
2259 RBBIWordMonkey::RBBIWordMonkey()
2260 {
2261     UErrorCode  status = U_ZERO_ERROR;
2262
2263     fSets            = new UVector(status);
2264
2265     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2266     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2267     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2268     fDictionaryCjkSet= new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:][:Katakana:]]", status);
2269     // Exclude Hangul syllables from ALetterSet during testing.
2270     // Leave CJK dictionary characters out from the monkey tests!
2271 #if 0
2272     fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}"
2273                                       "[\\p{Line_Break = Complex_Context}"
2274                                       "-\\p{Grapheme_Cluster_Break = Extend}"
2275                                       "-\\p{Grapheme_Cluster_Break = Control}"
2276                                       "]]",
2277                                       status);
2278 #endif
2279     fRegionalIndicatorSet =  new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Regional_Indicator}]"), status);
2280     fKatakanaSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2281     fHebrew_LetterSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Hebrew_Letter}]"), status);
2282     fALetterSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"), status);
2283     fALetterSet->removeAll(*fDictionaryCjkSet);
2284     fSingle_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Single_Quote}]"),    status);
2285     fDouble_QuoteSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Double_Quote}]"),    status);
2286     fMidNumLetSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2287     fMidLetterSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2288     fMidNumSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2289     // TODO: this set used to contain [\\uff10-\\uff19] (fullwidth digits), but this breaks the test
2290     // we should figure out why
2291     fNumericSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2292     fFormatSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2293     fExtendNumLetSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2294     fExtendSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2295
2296     fOtherSet        = new UnicodeSet();
2297     if(U_FAILURE(status)) {
2298       deferredStatus = status;
2299       return;
2300     }
2301
2302     fOtherSet->complement();
2303     fOtherSet->removeAll(*fCRSet);
2304     fOtherSet->removeAll(*fLFSet);
2305     fOtherSet->removeAll(*fNewlineSet);
2306     fOtherSet->removeAll(*fKatakanaSet);
2307     fOtherSet->removeAll(*fHebrew_LetterSet);
2308     fOtherSet->removeAll(*fALetterSet);
2309     fOtherSet->removeAll(*fSingle_QuoteSet);
2310     fOtherSet->removeAll(*fDouble_QuoteSet);
2311     fOtherSet->removeAll(*fMidLetterSet);
2312     fOtherSet->removeAll(*fMidNumSet);
2313     fOtherSet->removeAll(*fNumericSet);
2314     fOtherSet->removeAll(*fExtendNumLetSet);
2315     fOtherSet->removeAll(*fFormatSet);
2316     fOtherSet->removeAll(*fExtendSet);
2317     fOtherSet->removeAll(*fRegionalIndicatorSet);
2318     // Inhibit dictionary characters from being tested at all.
2319     fOtherSet->removeAll(*fDictionaryCjkSet);
2320     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2321
2322     fSets->addElement(fCRSet,                status);
2323     fSets->addElement(fLFSet,                status);
2324     fSets->addElement(fNewlineSet,           status);
2325     fSets->addElement(fRegionalIndicatorSet, status);
2326     fSets->addElement(fHebrew_LetterSet,     status);
2327     fSets->addElement(fALetterSet,           status);
2328     fSets->addElement(fSingle_QuoteSet,      status);
2329     fSets->addElement(fDouble_QuoteSet,      status);
2330     //fSets->addElement(fKatakanaSet,          status); //TODO: work out how to test katakana
2331     fSets->addElement(fMidLetterSet,         status);
2332     fSets->addElement(fMidNumLetSet,         status);
2333     fSets->addElement(fMidNumSet,            status);
2334     fSets->addElement(fNumericSet,           status);
2335     fSets->addElement(fFormatSet,            status);
2336     fSets->addElement(fExtendSet,            status);
2337     fSets->addElement(fOtherSet,             status);
2338     fSets->addElement(fExtendNumLetSet,      status);
2339
2340     if (U_FAILURE(status)) {
2341         deferredStatus = status;
2342     }
2343 }
2344
2345 void RBBIWordMonkey::setText(const UnicodeString &s) {
2346     this->fText = &s;   // only the address of the caller's string is kept; no copy is made
2347 }
2348
2349
2350 int32_t RBBIWordMonkey::next(int32_t prevPos) {
         // Reference implementation of the word break rules used to check the real iterator.
         // Scans the text given to setText() forward from prevPos and returns the index of the
         // first position at which no "do not break" rule applies.
         // Returns -1 when construction failed (deferredStatus) or when prevPos is already at or
         // past the end of the text.
2351     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2352                               //   break position being tested.  The candidate break
2353                               //   location is before p2.
2354
2355     int     breakPos = -1;
2356
2357     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2358
2359     if (U_FAILURE(deferredStatus)) {
2360         return -1;
2361     }
2362
2363     // Prev break at end of string.  return DONE.
2364     if (prevPos >= fText->length()) {
2365         return -1;
2366     }
2367     p0 = p1 = p2 = p3 = prevPos;
2368     c3 =  fText->char32At(prevPos);
2369     c0 = c1 = c2 = 0;
2370     (void)p0;       // Suppress set but not used warning.
2371
2372     // Loop runs once per "significant" character position in the input text.
2373     for (;;) {
2374         // Move all of the positions forward in the input string.
2375         p0 = p1;  c0 = c1;
2376         p1 = p2;  c1 = c2;
2377         p2 = p3;  c2 = c3;
2378
2379         // Advance p3 by    X(Extend | Format)*   Rule 4
2380         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2381         do {
2382             p3 = fText->moveIndex32(p3, 1);
2383             c3 = fText->char32At(p3);
2384             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2385                break;
2386             };
2387         }
2388         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2389
2390
2391         if (p1 == p2) {
2392             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2393             continue;
2394         }
2395         if (p2 == fText->length()) {
2396             // Reached end of string.  Always a break position.
2397             break;
2398         }
2399
2400         // Rule  (3)   CR x LF
2401         //     No Extend or Format characters may appear between the CR and LF.
2402         //     The advance loop above steps exactly one code point past a CR, LF or Newline
2403         //     (it never absorbs Extend/Format after them), so c1 and c2 are adjacent here.
2404         if (c1==0x0D && c2==0x0A) {
2405             continue;
2406         }
2407
2408         // Rule (3a)  Break before and after newlines (including CR and LF)
2409         //
2410         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2411             break;
2412         };
2413         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2414             break;
2415         };
2416
2417         // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
2418         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2419             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2420             continue;
2421         }
2422
2423         // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
2424         //
2425         if ( (fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1))   &&
2426              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2)) &&
2427              (fALetterSet->contains(c3) || fHebrew_LetterSet->contains(c3))) {
2428             continue;
2429         }
2430
2431         // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
2432         if ((fALetterSet->contains(c0) || fHebrew_LetterSet->contains(c0)) &&
2433             (fMidLetterSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1)) &&
2434             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2))) {
2435             continue;
2436         }
2437
2438         // Rule (7a)     Hebrew_Letter x Single_Quote
2439         if (fHebrew_LetterSet->contains(c1) && fSingle_QuoteSet->contains(c2)) {
2440             continue;
2441         }
2442
2443         // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
2444         if (fHebrew_LetterSet->contains(c1) && fDouble_QuoteSet->contains(c2) && fHebrew_LetterSet->contains(c3)) {
2445             continue;
2446         }
2447
2448         // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
2449         if (fHebrew_LetterSet->contains(c0) && fDouble_QuoteSet->contains(c1) && fHebrew_LetterSet->contains(c2)) {
2450             continue;
2451         }
2452
2453         // Rule (8)    Numeric x Numeric
2454         if (fNumericSet->contains(c1) &&
2455             fNumericSet->contains(c2))  {
2456             continue;
2457         }
2458
2459         // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
2460         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1)) &&
2461             fNumericSet->contains(c2))  {
2462             continue;
2463         }
2464
2465         // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
2466         if (fNumericSet->contains(c1) &&
2467             (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2)))  {
2468             continue;
2469         }
2470
2471         // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
2472         if (fNumericSet->contains(c0) &&
2473             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1) || fSingle_QuoteSet->contains(c1))  &&
2474             fNumericSet->contains(c2)) {
2475             continue;
2476         }
2477
2478         // Rule (12)  Numeric x (MidNum | MidNumLet | Single_Quote) Numeric
2479         if (fNumericSet->contains(c1) &&
2480             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2) || fSingle_QuoteSet->contains(c2))  &&
2481             fNumericSet->contains(c3)) {
2482             continue;
2483         }
2484
2485         // Rule (13)  Katakana x Katakana
2486         if (fKatakanaSet->contains(c1) &&
2487             fKatakanaSet->contains(c2))  {
2488             continue;
2489         }
2490
2491         // Rule 13a    (ALetter | Hebrew_Letter | Numeric | Katakana | ExtendNumLet) x ExtendNumLet
2492         if ((fALetterSet->contains(c1) || fHebrew_LetterSet->contains(c1) ||fNumericSet->contains(c1) ||
2493              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2494              fExtendNumLetSet->contains(c2)) {
2495                 continue;
2496         }
2497
2498         // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
2499         if (fExtendNumLetSet->contains(c1) &&
2500                 (fALetterSet->contains(c2) || fHebrew_LetterSet->contains(c2) ||
2501                  fNumericSet->contains(c2) || fKatakanaSet->contains(c2)))  {
2502             continue;
2503         }
2504
2505         // Rule 13c   Regional_Indicator x Regional_Indicator
2506         if (fRegionalIndicatorSet->contains(c1) && fRegionalIndicatorSet->contains(c2)) {
2507             continue;
2508         }
2509
2510         // Rule 14.  Break found here.
2511         break;
2512     }
2513
         // The break is reported in front of the code point at p2.
2514     breakPos = p2;
2515     return breakPos;
2516 }
2517
2518
2519 UVector *RBBIWordMonkey::charClasses() {
2520     return this->fSets;   // the list of character-class sets filled in by the constructor
2521 }
2522
2523
2524 RBBIWordMonkey::~RBBIWordMonkey() {
2525     // Free the class-list vector first, then every character-class set held by this object.
2526     delete fSets;
2527
2528     UnicodeSet *const allSets[] = {
2529         fCRSet,            fLFSet,            fNewlineSet,
2530         fKatakanaSet,      fHebrew_LetterSet, fALetterSet,
2531         fSingle_QuoteSet,  fDouble_QuoteSet,  fMidNumLetSet,
2532         fMidLetterSet,     fMidNumSet,        fNumericSet,
2533         fFormatSet,        fExtendSet,        fExtendNumLetSet,
2534         fRegionalIndicatorSet, fDictionaryCjkSet, fOtherSet
2535     };
2536     const int32_t setCount = (int32_t)(sizeof(allSets) / sizeof(allSets[0]));
2537     for (int32_t idx = 0; idx < setCount; ++idx) {
2538         delete allSets[idx];
2539     }
2544 }
2545
2546
2547
2548
2549 //------------------------------------------------------------------------------------------
2550 //
2551 //   class RBBISentMonkey      Sentence Break specific implementation
2552 //                             of RBBIMonkeyKind.
2553 //
2554 //------------------------------------------------------------------------------------------
2555 class RBBISentMonkey: public RBBIMonkeyKind {
         // Reference (test-side) implementation of the sentence break rules.  The constructor builds
         // one UnicodeSet per Sentence_Break class; next() applies the rules to the text from setText().
2556 public:
2557     RBBISentMonkey();
2558     virtual          ~RBBISentMonkey();
2559     virtual  UVector *charClasses();                     // returns fSets
2560     virtual  void     setText(const UnicodeString &s);   // stores a pointer to s; s is not copied
2561     virtual int32_t   next(int32_t i);                   // next break after index i, or -1
2562 private:
2563     int               moveBack(int posFrom);      // previous significant index, skipping Extend/Format; -1 if posFrom <= 0
2564     int               moveForward(int posFrom);   // next significant index, skipping Extend/Format; clamps at text length
2565     UChar32           cAt(int pos);               // code point at pos, or -1 when pos is out of range
2566
2567     UVector      *fSets;         // the character-class sets below, in the order the constructor adds them
2568
2569     UnicodeSet  *fSepSet;        // Sentence_Break=Sep plus LF (U+000A) and CR (U+000D)
2570     UnicodeSet  *fFormatSet;     // Sentence_Break=Format
2571     UnicodeSet  *fSpSet;         // Sentence_Break=Sp
2572     UnicodeSet  *fLowerSet;      // Sentence_Break=Lower
2573     UnicodeSet  *fUpperSet;      // Sentence_Break=Upper
2574     UnicodeSet  *fOLetterSet;    // Sentence_Break=OLetter
2575     UnicodeSet  *fNumericSet;    // Sentence_Break=Numeric
2576     UnicodeSet  *fATermSet;      // Sentence_Break=ATerm
2577     UnicodeSet  *fSContinueSet;  // Sentence_Break=SContinue
2578     UnicodeSet  *fSTermSet;      // Sentence_Break=STerm
2579     UnicodeSet  *fCloseSet;      // Sentence_Break=Close
2580     UnicodeSet  *fOtherSet;      // everything not in any of the other sets
2581     UnicodeSet  *fExtendSet;     // Sentence_Break=Extend
2582
2583     const UnicodeString  *fText; // text under test; assigned only by setText()
2584
2585 };
2586
2587 RBBISentMonkey::RBBISentMonkey()
2588 {
         // Builds one UnicodeSet per Sentence_Break class, derives fOtherSet as the complement of
         // all of them, and registers every set in fSets.  Any failure is recorded in deferredStatus
         // rather than reported here.
         // Note: fText is not assigned in this constructor; setText() must be called before next().
2589     UErrorCode  status = U_ZERO_ERROR;
2590
2591     fSets            = new UVector(status);
2592
2593     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2594     //                       set and made into character classes of their own.  For the monkey impl,
2595     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2596     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2597     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2598     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2599     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2600     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2601     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2602     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2603     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2604     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2605     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2606     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2607     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2608     fOtherSet        = new UnicodeSet();
2609
         // Every member pointer has been assigned by this point, so the destructor can delete them
         // all even when we bail out here.
2610     if(U_FAILURE(status)) {
2611       deferredStatus = status;
2612       return;
2613     }
2614
         // fOtherSet starts empty; complement() turns it into "all code points", from which every
         // named class is then removed.
2615     fOtherSet->complement();
2616     fOtherSet->removeAll(*fSepSet);
2617     fOtherSet->removeAll(*fFormatSet);
2618     fOtherSet->removeAll(*fSpSet);
2619     fOtherSet->removeAll(*fLowerSet);
2620     fOtherSet->removeAll(*fUpperSet);
2621     fOtherSet->removeAll(*fOLetterSet);
2622     fOtherSet->removeAll(*fNumericSet);
2623     fOtherSet->removeAll(*fATermSet);
2624     fOtherSet->removeAll(*fSContinueSet);
2625     fOtherSet->removeAll(*fSTermSet);
2626     fOtherSet->removeAll(*fCloseSet);
2627     fOtherSet->removeAll(*fExtendSet);
2628
         // The order of these entries is visible to callers through charClasses().
2629     fSets->addElement(fSepSet,       status);
2630     fSets->addElement(fFormatSet,    status);
2631     fSets->addElement(fSpSet,        status);
2632     fSets->addElement(fLowerSet,     status);
2633     fSets->addElement(fUpperSet,     status);
2634     fSets->addElement(fOLetterSet,   status);
2635     fSets->addElement(fNumericSet,   status);
2636     fSets->addElement(fATermSet,     status);
2637     fSets->addElement(fSContinueSet, status);
2638     fSets->addElement(fSTermSet,     status);
2639     fSets->addElement(fCloseSet,     status);
2640     fSets->addElement(fOtherSet,     status);
2641     fSets->addElement(fExtendSet,    status);
2642
2643     if (U_FAILURE(status)) {
2644         deferredStatus = status;
2645     }
2646 }
2647
2648
2649
2650 void RBBISentMonkey::setText(const UnicodeString &s) {
2651     this->fText = &s;   // only the address of the caller's string is kept; no copy is made
2652 }
2653
2654 UVector *RBBISentMonkey::charClasses() {
2655     return this->fSets;   // the list of character-class sets filled in by the constructor
2656 }
2657
2658
2659 //  moveBack()   Find the "significant" code point preceding the index i.
2660 //               Skips over ($Extend | $Format)* .
2661 //
2662 int RBBISentMonkey::moveBack(int i) {
2663     if (i <= 0) {
2664         return -1;          // nothing precedes the start of the text
2665     }
2666     int32_t idx = i;
2667     for (;;) {
2668         idx = fText->moveIndex32(idx, -1);
2669         const UChar32 cp = fText->char32At(idx);
2670         // Stop at index 0, or on the first code point that is neither Format nor Extend.
2671         if (idx <= 0 || !(fFormatSet->contains(cp) || fExtendSet->contains(cp))) break;
2672     }
2673     return idx;
2675 }
2676
2677
2678 int RBBISentMonkey::moveForward(int i) {
2679     const int32_t textLength = fText->length();
2680     if (i >= textLength) {
2681         return textLength;      // already at or past the end: clamp to the end index
2682     }
2683     // Step over one code point, then keep stepping while the code point landed on is Format or Extend.
2684     int32_t idx = fText->moveIndex32(i, 1);
2685     for (UChar32 cp = cAt(idx); fFormatSet->contains(cp) || fExtendSet->contains(cp); cp = cAt(idx)) {
2686         idx = fText->moveIndex32(idx, 1);
2687     }
2688     return idx;
2690 }
2691
2692 UChar32 RBBISentMonkey::cAt(int pos) {
2693     // Positions outside [0, length) yield the sentinel -1 instead of reading the text.
2694     return (pos >= 0 && pos < fText->length()) ? fText->char32At(pos) : -1;
2698 }
2699
2700 int32_t RBBISentMonkey::next(int32_t prevPos) {
         // Reference implementation of the sentence break rules used to check the real iterator.
         // Scans the text given to setText() forward from prevPos and returns the index of the
         // next break.  Returns -1 when construction failed (deferredStatus) or when prevPos is
         // already at or past the end of the text.
2701     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2702                               //   break position being tested.  The candidate break
2703                               //   location is before p2.
2704
2705     int     breakPos = -1;
2706
2707     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2708     UChar32 c;
2709
2710     if (U_FAILURE(deferredStatus)) {
2711         return -1;
2712     }
2713
2714     // Prev break at end of string.  return DONE.
2715     if (prevPos >= fText->length()) {
2716         return -1;
2717     }
2718     p0 = p1 = p2 = p3 = prevPos;
2719     c3 =  fText->char32At(prevPos);
2720     c0 = c1 = c2 = 0;
2721     (void)p0;     // Suppress set but not used warning.
2722
2723     // Loop runs once per "significant" character position in the input text.
2724     for (;;) {
2725         // Move all of the positions forward in the input string.
2726         p0 = p1;  c0 = c1;
2727         p1 = p2;  c1 = c2;
2728         p2 = p3;  c2 = c3;
2729
2730         // Advance p3 by    X(Extend | Format)*   Rule 4
2731         p3 = moveForward(p3);
2732         c3 = cAt(p3);
2733
2734         // Rule (3)  CR x LF   (only when the LF immediately follows the CR)
2735         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
2736             continue;
2737         }
2738
2739         // Rule (4).   Sep  <break>
2740         if (fSepSet->contains(c1)) {
2741             p2 = p1+1;   // Separators don't combine with Extend or Format.
2742             break;
2743         }
2744
2745         if (p2 >= fText->length()) {
2746             // Reached end of string.  Always a break position.
2747             break;
2748         }
2749
2750         if (p2 == prevPos) {
2751             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2752             continue;
2753         }
2754
2755         // Rule (6).   ATerm x Numeric
2756         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
2757             continue;
2758         }
2759
2760         // Rule (7).  Upper ATerm  x  Upper
2761         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
2762             continue;
2763         }
2764
2765         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
2766         //           Note:  STerm | ATerm are added to the negated part of the expression by a
2767         //                  note to the Unicode 5.0 documents.
             //           The backward scans below use moveBack(), which returns -1 at the start of the
             //           text; cAt(-1) then yields -1 (the scans rely on no set containing that value).
2768         int p8 = p1;
2769         while (fSpSet->contains(cAt(p8))) {
2770             p8 = moveBack(p8);
2771         }
2772         while (fCloseSet->contains(cAt(p8))) {
2773             p8 = moveBack(p8);
2774         }
2775         if (fATermSet->contains(cAt(p8))) {
                 // Look ahead from p2 for the first code point that ends the negated run (or the
                 // end of text, where cAt() returns -1).
2776             p8=p2;
2777             for (;;) {
2778                 c = cAt(p8);
2779                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
2780                     fLowerSet->contains(c) || fSepSet->contains(c) ||
2781                     fATermSet->contains(c) || fSTermSet->contains(c))  {
2782                     break;
2783                 }
2784                 p8 = moveForward(p8);
2785             }
2786             if (fLowerSet->contains(cAt(p8))) {
2787                 continue;
2788             }
2789         }
2790
2791         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
2792         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
2793             p8 = p1;
2794             while (fSpSet->contains(cAt(p8))) {
2795                 p8 = moveBack(p8);
2796             }
2797             while (fCloseSet->contains(cAt(p8))) {
2798                 p8 = moveBack(p8);
2799             }
2800             c = cAt(p8);
2801             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
2802                 continue;
2803             }
2804         }
2805
2806         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
             //           CR and LF are members of fSepSet in this implementation.
2807         int p9 = p1;
2808         while (fCloseSet->contains(cAt(p9))) {
2809             p9 = moveBack(p9);
2810         }
2811         c = cAt(p9);
2812         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
2813             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
2814                 continue;
2815             }
2816         }
2817
2818         // Rule (10)  (STerm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
2819         int p10 = p1;
2820         while (fSpSet->contains(cAt(p10))) {
2821             p10 = moveBack(p10);
2822         }
2823         while (fCloseSet->contains(cAt(p10))) {
2824             p10 = moveBack(p10);
2825         }
2826         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
2827             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
2828                 continue;
2829             }
2830         }
2831
2832         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
2833         int p11 = p1;
2834         if (fSepSet->contains(cAt(p11))) {
2835             p11 = moveBack(p11);
2836         }
2837         while (fSpSet->contains(cAt(p11))) {
2838             p11 = moveBack(p11);
2839         }
2840         while (fCloseSet->contains(cAt(p11))) {
2841             p11 = moveBack(p11);
2842         }
2843         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
2844             break;
2845         }
2846
2847         //  Rule (12)  Any x Any
2848         continue;
2849     }
         // The break is reported in front of the code point at p2.
2850     breakPos = p2;
2851     return breakPos;
2852 }
2853
2854 RBBISentMonkey::~RBBISentMonkey() {
2855     // Free the class-list vector first, then every character-class set held by this object.
2856     delete fSets;
2857
2858     UnicodeSet *const allSets[] = {
2859         fSepSet,     fFormatSet,  fSpSet,    fLowerSet,     fUpperSet,
2860         fOLetterSet, fNumericSet, fATermSet, fSContinueSet,
2861         fSTermSet,   fCloseSet,   fOtherSet, fExtendSet
2862     };
2863     const int32_t setCount = (int32_t)(sizeof(allSets) / sizeof(allSets[0]));
2864     for (int32_t idx = 0; idx < setCount; ++idx) {
2865         delete allSets[idx];
2866     }
2869 }
2870
2871
2872
2873 //-------------------------------------------------------------------------------------------
2874 //
2875 //  RBBILineMonkey
2876 //
2877 //-------------------------------------------------------------------------------------------
2878
2879 class RBBILineMonkey: public RBBIMonkeyKind {
         // Reference (test-side) implementation of the line break rules.  The two-letter members
         // below hold the UnicodeSet for the Line_Break property value of the same name, as built
         // by the constructor.
2880 public:
2881     RBBILineMonkey();
2882     virtual          ~RBBILineMonkey();
2883     virtual  UVector *charClasses();
2884     virtual  void     setText(const UnicodeString &s);
2885     virtual  int32_t  next(int32_t i);
         // Applies line break rules LB 9 and LB 10 (combining mark handling) around position pos;
         // posChar, nextPos and nextChar are in/out parameters.
2886     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
2887 private:
2888     UVector      *fSets;   // list of character-class sets, filled in by the constructor
2889
2890     UnicodeSet  *fBK;
2891     UnicodeSet  *fCR;
2892     UnicodeSet  *fLF;
2893     UnicodeSet  *fCM;
2894     UnicodeSet  *fNL;
2895     UnicodeSet  *fSG;      // built from the range U+D800..U+DFFF rather than from a property
2896     UnicodeSet  *fWJ;
2897     UnicodeSet  *fZW;
2898     UnicodeSet  *fGL;
2899     UnicodeSet  *fCB;
2900     UnicodeSet  *fSP;
2901     UnicodeSet  *fB2;
2902     UnicodeSet  *fBA;
2903     UnicodeSet  *fBB;
2904     UnicodeSet  *fHY;
2905     UnicodeSet  *fH2;
2906     UnicodeSet  *fH3;
2907     UnicodeSet  *fCL;
2908     UnicodeSet  *fCP;
2909     UnicodeSet  *fEX;
2910     UnicodeSet  *fIN;
2911     UnicodeSet  *fJL;
2912     UnicodeSet  *fJV;
2913     UnicodeSet  *fJT;
2914     UnicodeSet  *fNS;      // the constructor also folds CJ into this set
2915     UnicodeSet  *fOP;
2916     UnicodeSet  *fQU;
2917     UnicodeSet  *fIS;
2918     UnicodeSet  *fNU;
2919     UnicodeSet  *fPO;
2920     UnicodeSet  *fPR;
2921     UnicodeSet  *fSY;
2922     UnicodeSet  *fAI;
2923     UnicodeSet  *fAL;      // the constructor also folds XX, AI, SA and SG into this set
2924     UnicodeSet  *fCJ;
2925     UnicodeSet  *fHL;
2926     UnicodeSet  *fID;
2927     UnicodeSet  *fRI;
2928     UnicodeSet  *fSA;
2929     UnicodeSet  *fXX;
2930
2931     BreakIterator        *fCharBI;         // character break iterator; NULL if construction failed early
2932     const UnicodeString  *fText;           // text under test; assigned only by setText()
2933     RegexMatcher         *fNumberMatcher;  // matcher for the number pattern built in the constructor; NULL if construction failed early
2934 };
2935
2936
2937 RBBILineMonkey::RBBILineMonkey()
2938 {
         // Builds one UnicodeSet per Line_Break class, folds the classes that default to AL / NS
         // into those sets, registers the sets in fSets, and creates the number-pattern matcher and
         // the character break iterator.  Failures are recorded in deferredStatus, not reported here.
         // Note: fText is not assigned in this constructor; it is set by setText().
2939     UErrorCode  status = U_ZERO_ERROR;
2940
2941     fSets  = new UVector(status);
2942
2943     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
2944     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
2945     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
2946     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
2947     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
2948     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
2949     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
2950     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
2951     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
2952     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
2953     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
2954     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
2955     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
2956     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
2957     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
2958     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
2959     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
2960     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
2961     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
2962     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
2963     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
2964     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
2965     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
2966     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
2967     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
2968     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
2969     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
2970     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
2971     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
2972     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
2973     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
2974     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
2975     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
2976     fCJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CJ}]"), status);
2977     fHL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HL}]"), status);
2978     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
2979     fRI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=RI}]"), status);
2980     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
2981     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
2982     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
2983
         // On failure the helper objects are left NULL so that they are well defined; users of
         // fCharBI / fNumberMatcher must cope with that.
2984     if (U_FAILURE(status)) {
2985         deferredStatus = status;
2986         fCharBI = NULL;
2987         fNumberMatcher = NULL;
2988         return;
2989     }
2990
2991     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
2992     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
2993     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
2994     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
2995
2996     fNS->addAll(*fCJ);     // Default behavior for CJ is identical to NS.
2997
         // Register the classes.  The order and contents of fSets are visible through charClasses().
         // fCJ and fXX are not registered on their own; their members are part of fNS / fAL (above).
         // fAI, fSA and fSG are registered both on their own and as part of fAL.
2998     fSets->addElement(fBK, status);
2999     fSets->addElement(fCR, status);
3000     fSets->addElement(fLF, status);
3001     fSets->addElement(fCM, status);
3002     fSets->addElement(fNL, status);
3003     fSets->addElement(fWJ, status);
3004     fSets->addElement(fZW, status);
3005     fSets->addElement(fGL, status);
3006     fSets->addElement(fCB, status);
3007     fSets->addElement(fSP, status);
3008     fSets->addElement(fB2, status);
3009     fSets->addElement(fBA, status);
3010     fSets->addElement(fBB, status);
3011     fSets->addElement(fHY, status);
3012     fSets->addElement(fH2, status);
3013     fSets->addElement(fH3, status);
3014     fSets->addElement(fCL, status);
3015     fSets->addElement(fCP, status);
3016     fSets->addElement(fEX, status);
3017     fSets->addElement(fIN, status);
3018     fSets->addElement(fJL, status);
3019     fSets->addElement(fJT, status);
3020     fSets->addElement(fJV, status);
3021     fSets->addElement(fNS, status);
3022     fSets->addElement(fOP, status);
3023     fSets->addElement(fQU, status);
3024     fSets->addElement(fIS, status);
3025     fSets->addElement(fNU, status);
3026     fSets->addElement(fPO, status);
3027     fSets->addElement(fPR, status);
3028     fSets->addElement(fSY, status);
3029     fSets->addElement(fAI, status);
3030     fSets->addElement(fAL, status);
3031     fSets->addElement(fHL, status);
3032     fSets->addElement(fID, status);
3033     fSets->addElement(fWJ, status);   // NOTE(review): fWJ was already added above; the second entry looks unintentional -- confirm before changing, since it alters the exposed class list
3034     fSets->addElement(fRI, status);
3035     fSets->addElement(fSA, status);
3036     fSets->addElement(fSG, status);
3037
         // Regular expression for a number sequence, each element optionally followed by CM*:
         //   (PR | PO)?  (OP | HY)?  NU  (NU | IS | SY)*  (CL | CP)?  (PR | PO)?
3038     const char *rules =
3039             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3040             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3041             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3042             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3043             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3044             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3045
3046     fNumberMatcher = new RegexMatcher(
3047         UnicodeString(rules, -1, US_INV), 0, status);
3048
3049     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3050
3051     if (U_FAILURE(status)) {
3052         deferredStatus = status;
3053     }
3054 }
3055
3056
3057 void RBBILineMonkey::setText(const UnicodeString &s) {
         // Points this monkey at the text to analyze and hands the same text to the character
         // break iterator and the number matcher.  Only the address of s is stored; s is not copied.
3058     fText       = &s;

         // The constructor leaves fCharBI / fNumberMatcher NULL when it fails early, and records any
         // failure in deferredStatus.  next() already returns -1 in that state, so skip the helper
         // objects here instead of dereferencing something that was never successfully built.
         if (U_FAILURE(deferredStatus) || fCharBI == NULL || fNumberMatcher == NULL) {
             return;
         }
3059     fCharBI->setText(s);
3060     fNumberMatcher->reset(s);
3061 }
3062
3063 //
3064 //  rule9Adjust
3065 //     Line Break TR rules 9 and 10 implementation.
3066 //     This deals with combining marks and other sequences
3067 //     that must be treated as if they were something other than what they actually are.
3068 //
3069 //     This is factored out into a separate function because it must be applied twice for
3070 //     each potential break, once to the chars before the position being checked, then
3071 //     again to the text following the possible break.
3072 //
3073 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3074     // A position of -1 marks the warm-up pass of the main loop in next(); leave everything untouched.
3075     if (pos == -1) {
3076         return;
3077     }
3078
3079     int32_t scanPos = *nextPos;
3080
3081     // LB 9: keep a combining sequence together by stepping over Line_Break=CM characters.
3082     //       Nothing is skipped when the current character is SP, BK, CR, LF, NL or ZW.
3083     //       (Line Break CM is not the same as the Grapheme Extend property.)
3084     const UChar32 base = *posChar;
3085     const bool cmCannotAttach = fSP->contains(base) || fBK->contains(base) || base == 0x0d ||
3086                                 base == 0x0a || fNL->contains(base) || fZW->contains(base);
3087     if (!cmCannotAttach) {
3088         *nextChar = fText->char32At(scanPos);
3089         while (fCM->contains(*nextChar)) {
3090             scanPos   = fText->moveIndex32(scanPos, 1);
3091             *nextChar = fText->char32At(scanPos);
3092         }
3093     }
3094
3095     // LB 9 also says "treat X CM* as X"; that needs no code here.
3096     // LB 10: a combining mark that is still the current character is treated as AL ('A').
3097     if (fCM->contains(*posChar)) {
3098         *posChar = 0x41;
3099     }
3100
3101     // Hand the (possibly advanced) position and the character found there back to the caller.
3102     *nextPos  = scanPos;
3103     *nextChar = fText->char32At(scanPos);
3110 }
3111
3112
3113
// next() -- scan forward through fText from startPos for the next line break
//           position.  TestLineBreaks() calls this repeatedly to build its
//           array of expected boundaries.
//
//   Each pass of the main loop examines one candidate position between
//   prevChar and thisChar and tries the rules in the order written below.
//   Inside the loop, 'break' means "a boundary was found, stop scanning" and
//   'continue' means "no boundary here, move on to the next position".
//
//   startPos   Index into fText at which to begin scanning.
//   Returns    The value of pos when the loop exits, or -1 if deferredStatus
//              holds a failure or startPos is at or beyond the end of fText.
3114 int32_t RBBILineMonkey::next(int32_t startPos) {
3115     UErrorCode status = U_ZERO_ERROR;
3116     int32_t    pos;       //  Index of the char following a potential break position
3117     UChar32    thisChar;  //  Character at above position "pos"
3118
3119     int32_t    prevPos;   //  Index of the char preceding a potential break position
3120     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3121                           //   and thisChar may not be adjacent because combining
3122                           //   characters between them will be ignored.
3123
3124     int32_t    prevPosX2; //  Second previous character.  Wider context for LB21a.
3125     UChar32    prevCharX2;
3126
3127     int32_t    nextPos;   //  Index of the next character following pos.
3128                           //     Usually skips over combining marks.
3129     int32_t    nextCPPos; //  Index of the code point following "pos."
3130                           //     May point to a combining mark.
3131     int32_t    tPos;      //  temp value.
3132     UChar32    c;
3133
3134     if (U_FAILURE(deferredStatus)) {
3135         return -1;
3136     }
3137
3138     if (startPos >= fText->length()) {
3139         return -1;
3140     }
3141
3142
3143     // Initial values for loop.  Loop will run the first time without finding breaks,
3144     //                           while the invalid values shift out and the "this" and
3145     //                           "prev" positions are filled in with good values.
3146     pos      = prevPos   = prevPosX2  = -1;    // Invalid value, serves as flag for initial loop iteration.
3147     thisChar = prevChar  = prevCharX2 = 0;
3148     nextPos  = nextCPPos = startPos;
3149
3150
3151     // Loop runs once per position in the test text, until a break position
3152     //  is found.
3153     for (;;) {
          // Shift the context window along by one position:
          //    prevX2 <- prev <- this <- next
3154         prevPosX2 = prevPos;
3155         prevCharX2 = prevChar;
3156
3157         prevPos   = pos;
3158         prevChar  = thisChar;
3159
3160         pos       = nextPos;
3161         thisChar  = fText->char32At(pos);
3162
3163         nextCPPos = fText->moveIndex32(pos, 1);
3164         nextPos   = nextCPPos;
3165
3166         // Rule LB2 - Break at end of text.
3167         if (pos >= fText->length()) {
3168             break;
3169         }
3170
3171         // Rule LB 9 - adjust for combining sequences.
3172         //             We do this one out-of-order because the adjustment does not change anything
3173         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3174         //             be applied.
          //             The first call may move pos/thisChar past combining marks that follow
          //             prevChar; nextPos is then recomputed from the adjusted pos before the
          //             second call does the same for the marks that follow thisChar.
3175         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3176         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3177         c = fText->char32At(nextPos);
3178         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3179
3180         // If the loop is still warming up - if we haven't shifted the initial
3181         //   -1 positions out of prevPos yet - loop back to advance the
3182         //    position in the input without any further looking for breaks.
3183         if (prevPos == -1) {
3184             continue;
3185         }
3186
3187         // LB 4  Always break after hard line breaks,
3188         if (fBK->contains(prevChar)) {
3189             break;
3190         }
3191
3192         // LB 5  Break after CR, LF, NL, but not inside CR LF
3193         if (prevChar == 0x0d && thisChar == 0x0a) {
3194             continue;
3195         }
3196         if (prevChar == 0x0d ||
3197             prevChar == 0x0a ||
3198             prevChar == 0x85)  {
3199             break;
3200         }
3201
3202         // LB 6  Don't break before hard line breaks
3203         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3204             fBK->contains(thisChar)) {
3205                 continue;
3206         }
3207
3208
3209         // LB 7  Don't break before spaces or zero-width space.
3210         if (fSP->contains(thisChar)) {
3211             continue;
3212         }
3213
3214         if (fZW->contains(thisChar)) {
3215             continue;
3216         }
3217
3218         // LB 8  Break after zero width space
3219         if (fZW->contains(prevChar)) {
3220             break;
3221         }
3222
3223         // LB 9, 10  Already done, at top of loop.
3224         //
3225
3226
3227         // LB 11  Do not break before or after WORD JOINER and related characters.
3228         //    x  WJ
3229         //    WJ  x
3230         //
3231         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3232             continue;
3233         }
3234
3235         // LB 12
3236         //    GL  x
3237         if (fGL->contains(prevChar)) {
3238             continue;
3239         }
3240
3241         // LB 12a
3242         //    [^SP BA HY] x GL
3243         if (!(fSP->contains(prevChar) ||
3244               fBA->contains(prevChar) ||
3245               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3246             continue;
3247         }
3248
3249
3250
3251         // LB 13  Don't break before closings.
3252         //        NU x CL,  NU x CP,  NU x IS  and NU x SY are not matched here so that they will
3253         //        fall into LB 25 and the more general number regular expression.
3254         //
3255         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3256             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3257                                          fEX->contains(thisChar)  ||
3258             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3259             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3260             continue;
3261         }
3262
3263         // LB 14 Don't break after OP SP*
3264         //       Scan backwards, checking for this sequence.
3265         //       The OP char could include combining marks, so we actually check for
3266         //           OP CM* SP*
3267         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3268         //       sequence into a ID char, so before scanning back through spaces,
3269         //       verify that prevChar is indeed a space.  The prevChar variable
3270         //       may differ from fText[prevPos]
3271         tPos = prevPos;
3272         if (fSP->contains(prevChar)) {
3273             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3274                 tPos=fText->moveIndex32(tPos, -1);
3275             }
3276         }
3277         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3278             tPos=fText->moveIndex32(tPos, -1);
3279         }
3280         if (fOP->contains(fText->char32At(tPos))) {
3281             continue;
3282         }
3283
3284
3285         // LB 15    QU SP* x OP
3286         if (fOP->contains(thisChar)) {
3287             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
              // NOTE: this local 'tPos' shadows the function-level tPos declared above.
              //       Unlike LB 14 and LB 17, the space scan here is not guarded by a
              //       check that prevChar is a space.
3288             int tPos = prevPos;
3289             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3290                 tPos = fText->moveIndex32(tPos, -1);
3291             }
3292             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3293                 tPos = fText->moveIndex32(tPos, -1);
3294             }
3295             if (fQU->contains(fText->char32At(tPos))) {
3296                 continue;
3297             }
3298         }
3299
3300
3301
3302         // LB 16   (CL | CP) SP* x NS
3303         //    Scan backwards for SP* CM* (CL | CP)
3304         if (fNS->contains(thisChar)) {
              // NOTE: as in LB 15, this 'tPos' shadows the function-level variable.
3305             int tPos = prevPos;
3306             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3307                 tPos = fText->moveIndex32(tPos, -1);
3308             }
3309             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3310                 tPos = fText->moveIndex32(tPos, -1);
3311             }
3312             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3313                 continue;
3314             }
3315         }
3316
3317
3318         // LB 17        B2 SP* x B2
3319         if (fB2->contains(thisChar)) {
3320             //  Scan backwards, checking for the B2 CM* SP* sequence.
3321             tPos = prevPos;
3322             if (fSP->contains(prevChar)) {
3323                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3324                     tPos=fText->moveIndex32(tPos, -1);
3325                 }
3326             }
3327             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3328                 tPos=fText->moveIndex32(tPos, -1);
3329             }
3330             if (fB2->contains(fText->char32At(tPos))) {
3331                 continue;
3332             }
3333         }
3334
3335
3336         // LB 18    break after space
3337         if (fSP->contains(prevChar)) {
3338             break;
3339         }
3340
3341         // LB 19
3342         //    x   QU
3343         //    QU  x
3344         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3345             continue;
3346         }
3347
3348         // LB 20  Break around a CB
3349         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3350             break;
3351         }
3352
3353         // LB 21
          //    x (BA | HY | NS)
          //    BB x
3354         if (fBA->contains(thisChar) ||
3355             fHY->contains(thisChar) ||
3356             fNS->contains(thisChar) ||
3357             fBB->contains(prevChar) )   {
3358             continue;
3359         }
3360
3361         // LB 21a
3362         //   HL (HY | BA) x
3363         if (fHL->contains(prevCharX2) &&
3364                 (fHY->contains(prevChar) || fBA->contains(prevChar))) {
3365             continue;
3366         }
3367
3368         // LB 21b
3369         //   SY x HL
3370         if (fSY->contains(prevChar) && fHL->contains(thisChar)) {
3371             continue;
3372         }
3373
3374         // LB 22
          //    (AL | HL | ID | IN | NU) x IN
3375         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3376             (fHL->contains(prevChar) && fIN->contains(thisChar)) ||
3377             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3378             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3379             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3380             continue;
3381         }
3382
3383
3384         // LB 23    ID x PO
3385         //          AL x NU
3386         //          HL x NU
3387         //          NU x (AL | HL)
3388         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3389             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3390             (fHL->contains(prevChar) && fNU->contains(thisChar)) ||
3391             (fNU->contains(prevChar) && fAL->contains(thisChar)) ||
3392             (fNU->contains(prevChar) && fHL->contains(thisChar)) )   {
3393             continue;
3394         }
3395
3396         // LB 24  Do not break between prefix and letters or ideographs.
3397         //        PR x ID
3398         //        PR x (AL | HL)
3399         //        PO x (AL | HL)
3400         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3401             (fPR->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) ||
3402             (fPO->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))))  {
3403             continue;
3404         }
3405
3406
3407
3408         // LB 25    Numbers
          //          fNumberMatcher is tried at prevPos.  If the matcher reports a failure
          //          in status, the scan stops and the current pos is returned.
3409         if (fNumberMatcher->lookingAt(prevPos, status)) {
3410             if (U_FAILURE(status)) {
3411                 break;
3412             }
3413             // Matched a number.  But could have been just a single digit, which would
3414             //    not represent a "no break here" between prevChar and thisChar
3415             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3416             if (numEndIdx > pos) {
3417                 // Number match includes at least our two chars being checked
3418                 if (numEndIdx > nextPos) {
3419                     // Number match includes additional chars.  Update pos and nextPos
3420                     //   so that next loop iteration will continue at the end of the number,
3421                     //   checking for breaks between last char in number & whatever follows.
3422                     pos = nextPos = numEndIdx;
                      // Back pos up to the last character of the number that is not a
                      //   combining mark; nextPos stays at the end of the match.
3423                     do {
3424                         pos = fText->moveIndex32(pos, -1);
3425                         thisChar = fText->char32At(pos);
3426                     } while (fCM->contains(thisChar));
3427                 }
3428                 continue;
3429             }
3430         }
3431
3432
3433         // LB 26 Do not break a Korean syllable.
          //    JL x (JL | JV | H2 | H3)
3434         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3435                                         fJV->contains(thisChar) ||
3436                                         fH2->contains(thisChar) ||
3437                                         fH3->contains(thisChar))) {
3438                                             continue;
3439                                         }
3440
          //    (JV | H2) x (JV | JT)
3441         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3442             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3443                 continue;
3444         }
3445
          //    (JT | H3) x JT
3446         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3447             fJT->contains(thisChar)) {
3448                 continue;
3449         }
3450
3451         // LB 27 Treat a Korean Syllable Block the same as ID.
          //    (JL | JV | JT | H2 | H3) x IN
3452         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3453             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3454             fIN->contains(thisChar)) {
3455                 continue;
3456             }
          //    (JL | JV | JT | H2 | H3) x PO
3457         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3458             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3459             fPO->contains(thisChar)) {
3460                 continue;
3461             }
          //    PR x (JL | JV | JT | H2 | H3)
3462         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3463             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3464                 continue;
3465             }
3466
3467
3468
3469         // LB 28  Do not break between alphabetics ("at").
3470         if ((fAL->contains(prevChar) || fHL->contains(prevChar)) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3471             continue;
3472         }
3473
3474         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3475         if (fIS->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar))) {
3476             continue;
3477         }
3478
3479         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3480         //          (AL | HL | NU) x OP
3481         //          CP x (AL | HL | NU)
3482         if ((fAL->contains(prevChar) || fHL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3483             continue;
3484         }
3485         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fHL->contains(thisChar) || fNU->contains(thisChar))) {
3486             continue;
3487         }
3488
3489         // LB30a  Do not break between regional indicators.
3490         //        RI x RI
3491         if (fRI->contains(prevChar) && fRI->contains(thisChar)) {
3492             continue;
3493         }
3494
3495         // LB 31    Break everywhere else
3496         break;
3497
3498     }
3499
     // pos is the position at which the loop stopped.
3500     return pos;
3501 }
3502
3503
// charClasses() -- accessor for fSets.
//   Returns the fSets pointer itself, not a copy.  The destructor of this
//   class deletes fSets, so callers must not delete the returned vector.
3504 UVector  *RBBILineMonkey::charClasses() {
3505     return fSets;
3506 }
3507
3508
// Destructor -- deletes everything this object holds through its f* pointers:
//   the fSets vector, each of the per-line-break-class members, the fCharBI
//   break iterator and the fNumberMatcher regular expression matcher.
3509 RBBILineMonkey::~RBBILineMonkey() {
     // NOTE(review): every class member below is also deleted individually, so
     //   fSets presumably does not own (delete) its elements -- confirm against
     //   the constructor, which is outside this part of the file.
3510     delete fSets;
3511
     // One member per line break class, named after the class.
3512     delete fBK;
3513     delete fCR;
3514     delete fLF;
3515     delete fCM;
3516     delete fNL;
3517     delete fWJ;
3518     delete fZW;
3519     delete fGL;
3520     delete fCB;
3521     delete fSP;
3522     delete fB2;
3523     delete fBA;
3524     delete fBB;
3525     delete fHY;
3526     delete fH2;
3527     delete fH3;
3528     delete fCL;
3529     delete fCP;
3530     delete fEX;
3531     delete fIN;
3532     delete fJL;
3533     delete fJV;
3534     delete fJT;
3535     delete fNS;
3536     delete fOP;
3537     delete fQU;
3538     delete fIS;
3539     delete fNU;
3540     delete fPO;
3541     delete fPR;
3542     delete fSY;
3543     delete fAI;
3544     delete fAL;
3545     delete fCJ;
3546     delete fHL;
3547     delete fID;
3548     delete fRI;
3549     delete fSA;
3550     delete fSG;
3551     delete fXX;
3552
     // Helper objects: fNumberMatcher is the matcher used by the LB 25 number
     //   rule in next().
3553     delete fCharBI;
3554     delete fNumberMatcher;
3555 }
3556
3557
3558 //-------------------------------------------------------------------------------------------
3559 //
3560 //   TestMonkey
3561 //
3562 //     params
3563 //       seed=nnnnn        Random number starting seed.
3564 //                         Setting the seed allows errors to be reproduced.
3565 //       loop=nnn          Looping count.  Controls running time.
3566 //                         -1:  run forever.
3567 //                          0 or greater:  run length.
3568 //
3569 //       type = char | word | line | sent | title
3570 //
3571 //-------------------------------------------------------------------------------------------
3572
3573 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3574     int32_t val = defaultVal;
3575     name.append(" *= *(-?\\d+)");
3576     UErrorCode status = U_ZERO_ERROR;
3577     RegexMatcher m(name, params, 0, status);
3578     if (m.find()) {
3579         // The param exists.  Convert the string to an int.
3580         char valString[100];
3581         int32_t paramLength = m.end(1, status) - m.start(1, status);
3582         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3583             paramLength = (int32_t)(sizeof(valString)-2);
3584         }
3585         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3586         val = strtol(valString,  NULL, 10);
3587
3588         // Delete this parameter from the params string.
3589         m.reset();
3590         params = m.replaceFirst("", status);
3591     }
3592     U_ASSERT(U_SUCCESS(status));
3593     return val;
3594 }
3595 #endif
3596
3597 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3598 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3599                                     BreakIterator *bi,
3600                                     int expected[],
3601                                     int expectedcount)
3602 {
3603     int count = 0;
3604     int i = 0;
3605     int forward[50];
3606     bi->setText(ustr);
3607     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3608         forward[count] = i;
3609         if (count < expectedcount && expected[count] != i) {
3610             test->errln("break forward test failed: expected %d but got %d",
3611                         expected[count], i);
3612             break;
3613         }
3614         count ++;
3615     }
3616     if (count != expectedcount) {
3617         printStringBreaks(ustr, expected, expectedcount);
3618         test->errln("break forward test failed: missed %d match",
3619                     expectedcount - count);
3620         return;
3621     }
3622     // testing boundaries
3623     for (i = 1; i < expectedcount; i ++) {
3624         int j = expected[i - 1];
3625         if (!bi->isBoundary(j)) {
3626             printStringBreaks(ustr, expected, expectedcount);
3627             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3628             return;
3629         }
3630         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3631             if (bi->isBoundary(j)) {
3632                 printStringBreaks(ustr, expected, expectedcount);
3633                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3634                 return;
3635             }
3636         }
3637     }
3638
3639     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3640         count --;
3641         if (forward[count] != i) {
3642             printStringBreaks(ustr, expected, expectedcount);
3643             test->errln("happy break test previous() failed: expected %d but got %d",
3644                         forward[count], i);
3645             break;
3646         }
3647     }
3648     if (count != 0) {
3649         printStringBreaks(ustr, expected, expectedcount);
3650         test->errln("break test previous() failed: missed a match");
3651         return;
3652     }
3653
3654     // testing preceding
3655     for (i = 0; i < expectedcount - 1; i ++) {
3656         // int j = expected[i] + 1;
3657         int j = ustr.moveIndex32(expected[i], 1);
3658         for (; j <= expected[i + 1]; j ++) {
3659             if (bi->preceding(j) != expected[i]) {
3660                 printStringBreaks(ustr, expected, expectedcount);
3661                 test->errln("preceding(): Not expecting boundary at position %d", j);
3662                 return;
3663             }
3664         }
3665     }
3666 }
3667 #endif
3668
3669 void RBBITest::TestWordBreaks(void)
3670 {
3671 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3672
3673     Locale        locale("en");
3674     UErrorCode    status = U_ZERO_ERROR;
3675     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3676     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3677     // Replaced any C+J characters in a row with a random sequence of characters
3678     // of the same length to make our C+J segmentation not get in the way.
3679     static const char *strlist[] =
3680     {
3681     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
3682     "\\U000e0037\\u2666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u091c\\U000e0040\\u003b",
3683     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
3684     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
3685     "\\uac00\\u3588\\u009c\\u0953\\u194b",
3686     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3687     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
3688     "\\u2f1f\\u1634\\u05f8\\u0944\\u04f2\\u0cdf\\u1f9c\\u05f4\\u002e",
3689     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3690     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3691     "\\u2027\\U000e0067\\u0a47\\u00b7",
3692     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3693     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3694     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3695     "\\u0f66\\u2523\\u003a\\u0cae\\U000e0047\\u003a",
3696     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3697     "\\u0027\\u11af\\U000e0057\\u0602",
3698     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3699     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3700     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3701     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3702     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3703     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
3704     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3705     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3706     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3707     "\\u18f4\\U000e0049\\u20e7\\u2027",
3708     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3709     "\\ua183\\u102d\\u0bec\\u003a",
3710     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3711     "\\u003a\\u0e57\\u0fad\\u002e",
3712     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3713     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3714     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
3715     "\\u003a\\u0664\\u00b7\\u1fba",
3716     "\\u003b\\u0027\\u00b7\\u47a3",
3717     "\\u2027\\U000e0067\\u0a42\\u00b7\\u4edf\\uc26c\\u003a\\u4186\\u041b",
3718     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
3719     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
3720     };
3721     int loop;
3722     if (U_FAILURE(status)) {
3723         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3724         return;
3725     }
3726     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3727         // printf("looping %d\n", loop);
3728         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
3729         // RBBICharMonkey monkey;
3730         RBBIWordMonkey monkey;
3731
3732         int expected[50];
3733         int expectedcount = 0;
3734
3735         monkey.setText(ustr);
3736         int i;
3737         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3738             expected[expectedcount ++] = i;
3739         }
3740
3741         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3742     }
3743     delete bi;
3744 #endif
3745 }
3746
3747 void RBBITest::TestWordBoundary(void)
3748 {
3749     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
3750     Locale        locale("en");
3751     UErrorCode    status = U_ZERO_ERROR;
3752     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
3753     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
3754     UChar         str[50];
3755     static const char *strlist[] =
3756     {
3757     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
3758     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
3759     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
3760     "\\u2027\\U000e0067\\u0a47\\u00b7",
3761     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
3762     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
3763     "\\u0589\\U000e006e\\u0a42\\U000104a5",
3764     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
3765     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
3766     "\\u0027\\u11af\\U000e0057\\u0602",
3767     "\\U0001d7f2\\U000e007\\u0004\\u0589",
3768     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
3769     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
3770     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
3771     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
3772     "\\U000e0065\\u302c\\u09ee\\U000e0068",
3773     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
3774     "\\u0233\\U000e0020\\u0a69\\u0d6a",
3775     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
3776     "\\u58f4\\U000e0049\\u20e7\\u2027",
3777     "\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
3778     "\\ua183\\u102d\\u0bec\\u003a",
3779     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
3780     "\\u003a\\u0e57\\u0fad\\u002e",
3781     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
3782     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
3783     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
3784     "\\u003a\\u0664\\u00b7\\u1fba",
3785     "\\u003b\\u0027\\u00b7\\u47a3",
3786     };
3787     int loop;
3788     if (U_FAILURE(status)) {
3789         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3790         return;
3791     }
3792     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3793         // printf("looping %d\n", loop);
3794         u_unescape(strlist[loop], str, 20);
3795         UnicodeString ustr(str);
3796         int forward[50];
3797         int count = 0;
3798
3799         bi->setText(ustr);
3800         int prev = 0;
3801         int i;
3802         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3803             forward[count ++] = i;
3804             if (i > prev) {
3805                 int j;
3806                 for (j = prev + 1; j < i; j ++) {
3807                     if (bi->isBoundary(j)) {
3808                         printStringBreaks(ustr, forward, count);
3809                         errln("happy boundary test failed: expected %d not a boundary",
3810                                j);
3811                         return;
3812                     }
3813                 }
3814             }
3815             if (!bi->isBoundary(i)) {
3816                 printStringBreaks(ustr, forward, count);
3817                 errln("happy boundary test failed: expected %d a boundary",
3818                        i);
3819                 return;
3820             }
3821             prev = i;
3822         }
3823     }
3824     delete bi;
3825 }
3826
3827 void RBBITest::TestLineBreaks(void)
3828 {
3829 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3830     Locale        locale("en");
3831     UErrorCode    status = U_ZERO_ERROR;
3832     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
3833     const int32_t  STRSIZE = 50;
3834     UChar         str[STRSIZE];
3835     static const char *strlist[] =
3836     {
3837      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
3838      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
3839              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
3840      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
3841              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
3842      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
3843      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3844      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
3845      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
3846      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
3847      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
3848      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
3849      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
3850      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
3851      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
3852      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
3853      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
3854      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
3855      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
3856      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
3857      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
3858      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
3859      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
3860      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
3861      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
3862      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
3863      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
3864      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
3865      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
3866      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
3867      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
3868      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
3869      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
3870      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
3871      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
3872      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
3873      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
3874      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
3875      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
3876      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
3877      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
3878      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
3879          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
3880          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
3881          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
3882      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
3883          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
3884     };
3885     int loop;
3886     TEST_ASSERT_SUCCESS(status);
3887     if (U_FAILURE(status)) {
3888         return;
3889     }
3890     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3891         // printf("looping %d\n", loop);
3892         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
3893         if (t >= STRSIZE) {
3894             TEST_ASSERT(FALSE);
3895             continue;
3896         }
3897
3898
3899         UnicodeString ustr(str);
3900         RBBILineMonkey monkey;
3901         if (U_FAILURE(monkey.deferredStatus)) {
3902             continue;
3903         }
3904
3905         const int EXPECTEDSIZE = 50;
3906         int expected[EXPECTEDSIZE];
3907         int expectedcount = 0;
3908
3909         monkey.setText(ustr);
3910         int i;
3911         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3912             if (expectedcount >= EXPECTEDSIZE) {
3913                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3914                 return;
3915             }
3916             expected[expectedcount ++] = i;
3917         }
3918
3919         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3920     }
3921     delete bi;
3922 #endif
3923 }
3924
3925 void RBBITest::TestSentBreaks(void)
3926 {
3927 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3928     Locale        locale("en");
3929     UErrorCode    status = U_ZERO_ERROR;
3930     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
3931     UChar         str[200];
3932     static const char *strlist[] =
3933     {
3934      "Now\ris\nthe\r\ntime\n\rfor\r\r",
3935      "This\n",
3936      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
3937      "\"Sentence ending with a quote.\" Bye.",
3938      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
3939      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
3940      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
3941      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
3942      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
3943      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
3944      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
3945              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
3946              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
3947              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
3948      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
3949              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
3950              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
3951              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
3952              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
3953              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
3954     };
3955     int loop;
3956     if (U_FAILURE(status)) {
3957         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
3958         return;
3959     }
3960     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
3961         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
3962         UnicodeString ustr(str);
3963
3964         RBBISentMonkey monkey;
3965         if (U_FAILURE(monkey.deferredStatus)) {
3966             continue;
3967         }
3968
3969         const int EXPECTEDSIZE = 50;
3970         int expected[EXPECTEDSIZE];
3971         int expectedcount = 0;
3972
3973         monkey.setText(ustr);
3974         int i;
3975         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
3976             if (expectedcount >= EXPECTEDSIZE) {
3977                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
3978                 return;
3979             }
3980             expected[expectedcount ++] = i;
3981         }
3982
3983         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
3984     }
3985     delete bi;
3986 #endif
3987 }
3988
3989 void RBBITest::TestMonkey(char *params) {
3990 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
3991
3992     UErrorCode     status    = U_ZERO_ERROR;
3993     int32_t        loopCount = 500;
3994     int32_t        seed      = 1;
3995     UnicodeString  breakType = "all";
3996     Locale         locale("en");
3997     UBool          useUText  = FALSE;
3998
3999     if (quick == FALSE) {
4000         loopCount = 10000;
4001     }
4002
4003     if (params) {
4004         UnicodeString p(params);
4005         loopCount = getIntParam("loop", p, loopCount);
4006         seed      = getIntParam("seed", p, seed);
4007
4008         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4009         if (m.find()) {
4010             breakType = m.group(1, status);
4011             m.reset();
4012             p = m.replaceFirst("", status);
4013         }
4014
4015         RegexMatcher u(" *utext", p, 0, status);
4016         if (u.find()) {
4017             useUText = TRUE;
4018             u.reset();
4019             p = u.replaceFirst("", status);
4020         }
4021
4022
4023         // m.reset(p);
4024         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4025             // Each option is stripped out of the option string as it is processed.
4026             // All options have been checked.  The option string should have been completely emptied..
4027             char buf[100];
4028             p.extract(buf, sizeof(buf), NULL, status);
4029             buf[sizeof(buf)-1] = 0;
4030             errln("Unrecognized or extra parameter:  %s\n", buf);
4031             return;
4032         }
4033
4034     }
4035
4036     if (breakType == "char" || breakType == "all") {
4037         RBBICharMonkey  m;
4038         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4039         if (U_SUCCESS(status)) {
4040             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4041             if (breakType == "all" && useUText==FALSE) {
4042                 // Also run a quick test with UText when "all" is specified
4043                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4044             }
4045         }
4046         else {
4047             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4048         }
4049         delete bi;
4050     }
4051
4052     if (breakType == "word" || breakType == "all") {
4053         logln("Word Break Monkey Test");
4054         RBBIWordMonkey  m;
4055         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4056         if (U_SUCCESS(status)) {
4057             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4058         }
4059         else {
4060             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4061         }
4062         delete bi;
4063     }
4064
4065     if (breakType == "line" || breakType == "all") {
4066         logln("Line Break Monkey Test");
4067         RBBILineMonkey  m;
4068         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4069         if (loopCount >= 10) {
4070             loopCount = loopCount / 5;   // Line break runs slower than the others.
4071         }
4072         if (U_SUCCESS(status)) {
4073             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4074         }
4075         else {
4076             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4077         }
4078         delete bi;
4079     }
4080
4081     if (breakType == "sent" || breakType == "all"  ) {
4082         logln("Sentence Break Monkey Test");
4083         RBBISentMonkey  m;
4084         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4085         if (loopCount >= 10) {
4086             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4087         }
4088         if (U_SUCCESS(status)) {
4089             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4090         }
4091         else {
4092             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4093         }
4094         delete bi;
4095     }
4096
4097 #endif
4098 }
4099
4100 //
4101 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4102 //    Parameters:
4103 //       bi      - the break iterator to use
4104 //       mk      - MonkeyKind, abstraction for obtaining expected results
4105 //       name    - Name of test (char, word, etc.) for use in error messages
4106 //       seed    - Seed for starting random number generator (parameter from user)
4107 //       numIterations
4108 //
4109 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4110                          int32_t numIterations, UBool useUText) {
4111
4112 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4113
4114     const int32_t    TESTSTRINGLEN = 500;
4115     UnicodeString    testText;
4116     int32_t          numCharClasses;
4117     UVector          *chClasses;
4118     int              expected[TESTSTRINGLEN*2 + 1];
4119     int              expectedCount = 0;
4120     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4121     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4122     char             reverseBreaks[TESTSTRINGLEN*2+1];
4123     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4124     char             followingBreaks[TESTSTRINGLEN*2+1];
4125     char             precedingBreaks[TESTSTRINGLEN*2+1];
4126     int              i;
4127     int              loopCount = 0;
4128
4129     m_seed = seed;
4130
4131     numCharClasses = mk.charClasses()->size();
4132     chClasses      = mk.charClasses();
4133
4134     // Check for errors that occured during the construction of the MonkeyKind object.
4135     //  Can't report them where they occured because errln() is a method coming from intlTest,
4136     //  and is not visible outside of RBBITest :-(
4137     if (U_FAILURE(mk.deferredStatus)) {
4138         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4139         return;
4140     }
4141
4142     // Verify that the character classes all have at least one member.
4143     for (i=0; i<numCharClasses; i++) {
4144         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4145         if (s == NULL || s->size() == 0) {
4146             errln("Character Class #%d is null or of zero size.", i);
4147             return;
4148         }
4149     }
4150
4151     while (loopCount < numIterations || numIterations == -1) {
4152         if (numIterations == -1 && loopCount % 10 == 0) {
4153             // If test is running in an infinite loop, display a periodic tic so
4154             //   we can tell that it is making progress.
4155             fprintf(stderr, ".");
4156         }
4157         // Save current random number seed, so that we can recreate the random numbers
4158         //   for this loop iteration in event of an error.
4159         seed = m_seed;
4160
4161         // Populate a test string with data.
4162         testText.truncate(0);
4163         for (i=0; i<TESTSTRINGLEN; i++) {
4164             int32_t  aClassNum = m_rand() % numCharClasses;
4165             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4166             int32_t   charIdx = m_rand() % classSet->size();
4167             UChar32   c = classSet->charAt(charIdx);
4168             if (c < 0) {   // TODO:  deal with sets containing strings.
4169                 errln("c < 0");
4170                 break;
4171             }
4172             testText.append(c);
4173         }
4174
4175         // Calculate the expected results for this test string.
4176         mk.setText(testText);
4177         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4178         expectedBreaks[0] = 1;
4179         int32_t breakPos = 0;
4180         expectedCount = 0;
4181         for (;;) {
4182             breakPos = mk.next(breakPos);
4183             if (breakPos == -1) {
4184                 break;
4185             }
4186             if (breakPos > testText.length()) {
4187                 errln("breakPos > testText.length()");
4188             }
4189             expectedBreaks[breakPos] = 1;
4190             U_ASSERT(expectedCount<testText.length());
4191             expected[expectedCount ++] = breakPos;
4192             (void)expected;   // Set but not used warning.
4193                               // TODO (andy): check it out.
4194         }
4195
4196         // Find the break positions using forward iteration
4197         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4198         if (useUText) {
4199             UErrorCode status = U_ZERO_ERROR;
4200             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4201             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4202             bi->setText(testUText, status);
4203             TEST_ASSERT_SUCCESS(status);
4204             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4205                                       //  This UText can be closed immediately, so long as the
4206                                       //  testText string continues to exist.
4207         } else {
4208             bi->setText(testText);
4209         }
4210
4211         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4212             if (i < 0 || i > testText.length()) {
4213                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4214                 break;
4215             }
4216             forwardBreaks[i] = 1;
4217         }
4218
4219         // Find the break positions using reverse iteration
4220         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4221         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4222             if (i < 0 || i > testText.length()) {
4223                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4224                 break;
4225             }
4226             reverseBreaks[i] = 1;
4227         }
4228
4229         // Find the break positions using isBoundary() tests.
4230         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4231         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4232         for (i=0; i<=testText.length(); i++) {
4233             isBoundaryBreaks[i] = bi->isBoundary(i);
4234         }
4235
4236
4237         // Find the break positions using the following() function.
4238         // printf(".");
4239         memset(followingBreaks, 0, sizeof(followingBreaks));
4240         int32_t   lastBreakPos = 0;
4241         followingBreaks[0] = 1;
4242         for (i=0; i<testText.length(); i++) {
4243             breakPos = bi->following(i);
4244             if (breakPos <= i ||
4245                 breakPos < lastBreakPos ||
4246                 breakPos > testText.length() ||
4247                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4248                 UChar32 brkChar = testText.char32At(lastBreakPos);
4249                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4250                 errln("%s break monkey test: "
4251                     "Out of range value returned by BreakIterator::following().\n"
4252                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4253                          name, seed, i, breakPos, lastBreakPos);
4254                 }
4255                 break;
4256             }
4257             followingBreaks[breakPos] = 1;
4258             lastBreakPos = breakPos;
4259         }
4260
4261         // Find the break positions using the preceding() function.
4262         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4263         lastBreakPos = testText.length();
4264         precedingBreaks[testText.length()] = 1;
4265         for (i=testText.length(); i>0; i--) {
4266             breakPos = bi->preceding(i);
4267             if (breakPos >= i ||
4268                 breakPos > lastBreakPos ||
4269                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4270                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4271                 UChar32 brkChar = testText.char32At(breakPos);
4272                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4273                 errln("%s break monkey test: "
4274                     "Out of range value returned by BreakIterator::preceding().\n"
4275                     "index=%d;  prev returned %d; lastBreak=%d" ,
4276                     name,  i, breakPos, lastBreakPos);
4277                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4278                     precedingBreaks[i] = 2;   // Forces an error.
4279                 }
4280                 }
4281             } else {
4282                 if (breakPos >= 0) {
4283                     precedingBreaks[breakPos] = 1;
4284                 }
4285                 lastBreakPos = breakPos;
4286             }
4287         }
4288
4289         // Compare the expected and actual results.
4290         for (i=0; i<=testText.length(); i++) {
4291             const char *errorType = NULL;
4292             if  (forwardBreaks[i] != expectedBreaks[i]) {
4293                 errorType = "next()";
4294             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4295                 errorType = "previous()";
4296             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4297                 errorType = "isBoundary()";
4298             } else if (followingBreaks[i] != expectedBreaks[i]) {
4299                 errorType = "following()";
4300             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4301                 errorType = "preceding()";
4302             }
4303
4304
4305             if (errorType != NULL) {
4306                 // Format a range of the test text that includes the failure as
4307                 //  a data item that can be included in the rbbi test data file.
4308
4309                 // Start of the range is the last point where expected and actual results
4310                 //   both agreed that there was a break position.
4311                 int startContext = i;
4312                 int32_t count = 0;
4313                 for (;;) {
4314                     if (startContext==0) { break; }
4315                     startContext --;
4316                     if (expectedBreaks[startContext] != 0) {
4317                         if (count == 2) break;
4318                         count ++;
4319                     }
4320                 }
4321
4322                 // End of range is two expected breaks past the start position.
4323                 int endContext = i + 1;
4324                 int ci;
4325                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4326                     for (;;) {
4327                         if (endContext >= testText.length()) {break;}
4328                         if (expectedBreaks[endContext-1] != 0) {
4329                             if (count == 0) break;
4330                             count --;
4331                         }
4332                         endContext ++;
4333                     }
4334                 }
4335
4336                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4337                 UnicodeString errorText = "<data>";
4338                 /***if (strcmp(errorType, "next()") == 0) {
4339                     startContext = 0;
4340                     endContext = testText.length();
4341
4342                     printStringBreaks(testText, expected, expectedCount);
4343                 }***/
4344
4345                 for (ci=startContext; ci<endContext;) {
4346                     UnicodeString hexChars("0123456789abcdef");
4347                     UChar32  c;
4348                     int      bn;
4349                     c = testText.char32At(ci);
4350                     if (ci == i) {
4351                         // This is the location of the error.
4352                         errorText.append("<?>");
4353                     } else if (expectedBreaks[ci] != 0) {
4354                         // This a non-error expected break position.
4355                         errorText.append("\\");
4356                     }
4357                     if (c < 0x10000) {
4358                         errorText.append("\\u");
4359                         for (bn=12; bn>=0; bn-=4) {
4360                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4361                         }
4362                     } else {
4363                         errorText.append("\\U");
4364                         for (bn=28; bn>=0; bn-=4) {
4365                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4366                         }
4367                     }
4368                     ci = testText.moveIndex32(ci, 1);
4369                 }
4370                 errorText.append("\\");
4371                 errorText.append("</data>\n");
4372
4373                 // Output the error
4374                 char  charErrorTxt[500];
4375                 UErrorCode status = U_ZERO_ERROR;
4376                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4377                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4378                 const char *badLocale = bi->getLocaleID(ULOC_ACTUAL_LOCALE, status);
4379
4380                 UChar32 brkChar = testText.char32At(i);
4381                 if ((strcmp(name, "char") != 0 && strcmp(name, "word") != 0) || brkChar < 0x1F1E6 || brkChar > 0x1F1FF) { // Apple, skip RI char/word break monkey tests
4382                 errln("%s break monkey test error [%s].  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4383                     name, badLocale, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4384                     errorType, seed, i, charErrorTxt);
4385                 }
4386                 break;
4387             }
4388         }
4389
4390         loopCount++;
4391     }
4392 #endif
4393 }
4394
4395
4396 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4397 //             This test checks the initial patch,
4398 //             which is to just keep it from crashing.  Correct word boundaries
4399 //             await a proper fix to the dictionary code.
4400 //
4401 void RBBITest::TestBug5532(void)  {
4402    // Text includes a mixture of Thai and Latin.
4403    const unsigned char utf8Data[] = {
4404            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4405            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4406            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4407            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4408            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4409            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4410            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4411            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4412            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4413            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4414            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4415
4416     UErrorCode status = U_ZERO_ERROR;
4417     UText utext=UTEXT_INITIALIZER;
4418     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);
4419     TEST_ASSERT_SUCCESS(status);
4420
4421     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);
4422     TEST_ASSERT_SUCCESS(status);
4423     if (U_SUCCESS(status)) {
4424         bi->setText(&utext, status);
4425         TEST_ASSERT_SUCCESS(status);
4426
4427         int32_t breakCount = 0;
4428         int32_t previousBreak = -1;
4429         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {
4430             // For now, just make sure that the break iterator doesn't hang.
4431             TEST_ASSERT(previousBreak < bi->current());
4432             previousBreak = bi->current();
4433         }
4434         TEST_ASSERT(breakCount > 0);
4435     }
4436     delete bi;
4437     utext_close(&utext);
4438 }
4439
4440
4441 void RBBITest::TestBug9983(void)  {
4442     UnicodeString text = UnicodeString("\\u002A"  // * Other
4443                                        "\\uFF65"  //   Other
4444                                        "\\u309C"  //   Katakana
4445                                        "\\uFF9F"  //   Extend
4446                                        "\\uFF65"  //   Other
4447                                        "\\u0020"  //   Other
4448                                        "\\u0000").unescape();
4449
4450     UErrorCode status = U_ZERO_ERROR;
4451     LocalPointer<RuleBasedBreakIterator> brkiter(static_cast<RuleBasedBreakIterator *>(
4452         BreakIterator::createWordInstance(Locale::getRoot(), status)));
4453     TEST_ASSERT_SUCCESS(status);
4454     LocalPointer<RuleBasedBreakIterator> brkiterPOSIX(static_cast<RuleBasedBreakIterator *>(
4455         BreakIterator::createWordInstance(Locale::createFromName("en_US_POSIX"), status)));
4456     TEST_ASSERT_SUCCESS(status);
4457     if (U_FAILURE(status)) {
4458         return;
4459     }
4460     int32_t offset, rstatus, iterationCount;
4461
4462     brkiter->setText(text);
4463     brkiter->last();
4464     iterationCount = 0;
4465     while ( (offset = brkiter->previous()) != UBRK_DONE ) {
4466         iterationCount++;
4467         rstatus = brkiter->getRuleStatus();
4468         (void)rstatus;     // Suppress set but not used warning.
4469         if (iterationCount >= 10) {
4470            break;
4471         }
4472     }
4473     TEST_ASSERT(iterationCount == 6);
4474
4475     brkiterPOSIX->setText(text);
4476     brkiterPOSIX->last();
4477     iterationCount = 0;
4478     while ( (offset = brkiterPOSIX->previous()) != UBRK_DONE ) {
4479         iterationCount++;
4480         rstatus = brkiterPOSIX->getRuleStatus();
4481         (void)rstatus;     // Suppress set but not used warning.
4482         if (iterationCount >= 10) {
4483            break;
4484         }
4485     }
4486     TEST_ASSERT(iterationCount == 6);
4487 }
4488
4489
4490 //
4491 //  TestDebug    -  A place-holder test for debugging purposes.
4492 //                  For putting in fragments of other tests that can be invoked
4493 //                  for tracing  without a lot of unwanted extra stuff happening.
4494 //
4495 void RBBITest::TestDebug(void) {
4496 #if 0
4497     UErrorCode   status = U_ZERO_ERROR;
4498     int pos = 0;
4499     int ruleStatus = 0;
4500
4501     RuleBasedBreakIterator* bi =
4502        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4503        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4504        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4505     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");
4506     // UnicodeString s("Aaa.  Bcd");
4507     s = s.unescape();
4508     bi->setText(s);
4509     UBool r = bi->isBoundary(8);
4510     printf("%s", r?"true":"false");
4511     return;
4512     pos = bi->last();
4513     do {
4514         // ruleStatus = bi->getRuleStatus();
4515         printf("%d\t%d\n", pos, ruleStatus);
4516         pos = bi->previous();
4517     } while (pos != BreakIterator::DONE);
4518 #endif
4519 }
4520
4521 void RBBITest::TestProperties() {
4522     UErrorCode errorCode = U_ZERO_ERROR;
4523     UnicodeSet prependSet(UNICODE_STRING_SIMPLE("[:GCB=Prepend:]"), errorCode);
4524     if (!prependSet.isEmpty()) {
4525         errln(
4526             "[:GCB=Prepend:] is not empty any more. "
4527             "Uncomment relevant lines in source/data/brkitr/char.txt and "
4528             "change this test to the opposite condition.");
4529     }
4530 }
4531
4532 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */