icuSources/test/intltest/rbbitst.cpp

   1 /********************************************************************
   2  * COPYRIGHT:
   3  * Copyright (c) 1999-2011, International Business Machines Corporation and
   4  * others. All Rights Reserved.
   5  ********************************************************************/
   6 /************************************************************************
   7 *   Date        Name        Description
   8 *   12/15/99    Madhu        Creation.
   9 *   01/12/2000  Madhu        Updated for changed API and added new tests
  10 ************************************************************************/
  11
  12 #include <typeinfo>  // for 'typeid' to work
  13
  14 #include "unicode/utypes.h"
  15
  16 #if !UCONFIG_NO_BREAK_ITERATION
  17
  18 #include "unicode/utypes.h"
  19 #include "unicode/brkiter.h"
  20 #include "unicode/rbbi.h"
  21 #include "unicode/uchar.h"
  22 #include "unicode/utf16.h"
  23 #include "unicode/ucnv.h"
  24 #include "unicode/schriter.h"
  25 #include "unicode/uniset.h"
  26 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
  27 #include "unicode/ustring.h"
  28 #include "unicode/utext.h"
  29 #include "intltest.h"
  30 #include "rbbitst.h"
  31 #include <string.h>
  32 #include "uvector.h"
  33 #include "uvectr32.h"
  34 #include "triedict.h"
  35 #include <string.h>
  36 #include <stdio.h>
  37 #include <stdlib.h>
  38
  39 #define TEST_ASSERT(x) {if (!(x)) { \
  40     errln("Failure in file %s, line %d", __FILE__, __LINE__);}}
  41
  42 #define TEST_ASSERT_SUCCESS(errcode) { if (U_FAILURE(errcode)) { \
  43     errcheckln(errcode, "Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(errcode));}}
  44
  45
  46 //---------------------------------------------
  47 // runIndexedTest
  48 //---------------------------------------------
  49
  50 void RBBITest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* params )
  51 {
  52     if (exec) logln("TestSuite RuleBasedBreakIterator: ");
  53
  54     switch (index) {
  55 #if !UCONFIG_NO_FILE_IO
  56         case 0: name = "TestBug4153072";
  57             if(exec) TestBug4153072();                         break;
  58 #else
  59         case 0: name = "skip";
  60             break;
  61 #endif
  62
  63         case 1: name = "TestJapaneseLineBreak";
  64             if(exec) TestJapaneseLineBreak();                  break;
  65         case 2: name = "TestStatusReturn";
  66             if(exec) TestStatusReturn();                       break;
  67
  68 #if !UCONFIG_NO_FILE_IO
  69         case 3: name = "TestUnicodeFiles";
  70             if(exec) TestUnicodeFiles();                       break;
  71         case 4: name = "TestEmptyString";
  72             if(exec) TestEmptyString();                        break;
  73 #else
  74         case 3: case 4: name = "skip";
  75             break;
  76 #endif
  77
  78         case 5: name = "TestGetAvailableLocales";
  79             if(exec) TestGetAvailableLocales();                break;
  80
  81         case 6: name = "TestGetDisplayName";
  82             if(exec) TestGetDisplayName();                     break;
  83
  84 #if !UCONFIG_NO_FILE_IO
  85         case 7: name = "TestEndBehaviour";
  86             if(exec) TestEndBehaviour();                       break;
  87         case 8: name = "TestMixedThaiLineBreak";
  88              if(exec) TestMixedThaiLineBreak();                break;
  89         case 9: name = "TestThaiLineBreak";
  90              if(exec) TestThaiLineBreak();                     break;
  91         case 10: name = "TestMaiyamok";
  92              if(exec) TestMaiyamok();                          break;
  93         case 11: name = "TestWordBreaks";
  94              if(exec) TestWordBreaks();                        break;
  95         case 12: name = "TestWordBoundary";
  96              if(exec) TestWordBoundary();                      break;
  97         case 13: name = "TestLineBreaks";
  98              if(exec) TestLineBreaks();                        break;
  99         case 14: name = "TestSentBreaks";
 100              if(exec) TestSentBreaks();                        break;
 101         case 15: name = "TestExtended";
 102              if(exec) TestExtended();                          break;
 103 #else
 104         case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: name = "skip";
 105              break;
 106 #endif
 107
 108         case 16:
 109              if(exec) {
 110  #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
 111                name = "TestMonkey";
 112                TestMonkey(params);
 113  #else
 114                name = "skip";
 115  #endif
 116              }
 117                                                                break;
 118
 119 #if !UCONFIG_NO_FILE_IO
 120         case 17: name = "TestBug3818";
 121             if(exec) TestBug3818();                            break;
 122         case 18: name = "TestJapaneseWordBreak";
 123             if(exec) TestJapaneseWordBreak();                  break;
 124 #else
 125         case 17: case 18: name = "skip";
 126             break;
 127 #endif
 128
 129         case 19: name = "TestDebug";
 130             if(exec) TestDebug();                              break;
 131         case 20: name = "TestTrieDict";
 132             if(exec) TestTrieDict();                           break;
 133
 134 #if !UCONFIG_NO_FILE_IO
 135         case 21: name = "TestBug5775";
 136             if (exec) TestBug5775();                           break;
 137         case 22: name = "TestThaiBreaks";
 138             if (exec) TestThaiBreaks();                        break;
 139         case 23: name = "TestTailoredBreaks";
 140             if (exec) TestTailoredBreaks();                    break;
 141 #else
 142         case 21: case 22: case 23: name = "skip";
 143             break;
 144 #endif
 145         case 24: name = "TestDictRules";
 146             if (exec) TestDictRules();                         break;
 147         case 25: name = "TestBug5532";
 148             if (exec) TestBug5532();                           break;
 149         default: name = ""; break; //needed to end loop
 150     }
 151 }
 152
 153
 154 //---------------------------------------------------------------------------
 155 //
 156 //   class BITestData   Holds a set of Break iterator test data and results
 157 //                      Includes
 158 //                         - the string data to be broken
 159 //                         - a vector of the expected break positions.
 160 //                         - a vector of source line numbers for the data,
 161 //                               (to help see where errors occured.)
 162 //                         - The expected break tag values.
 163 //                         - Vectors of actual break positions and tag values.
 164 //                         - Functions for comparing actual with expected and
 165 //                            reporting errors.
 166 //
 167 //----------------------------------------------------------------------------
 168 class BITestData {
 169 public:
 170     UnicodeString    fDataToBreak;
 171     UVector          fExpectedBreakPositions;
 172     UVector          fExpectedTags;
 173     UVector          fLineNum;
 174     UVector          fActualBreakPositions;   // Test Results.
 175     UVector          fActualTags;
 176
 177     BITestData(UErrorCode &status);
 178     void             addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status);
 179     void             checkResults(const char *heading, RBBITest *test);
 180     void             err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx);
 181     void             clearResults();
 182 };
 183
 184 //
 185 // Constructor.
 186 //
 187 BITestData::BITestData(UErrorCode &status)
 188 : fExpectedBreakPositions(status), fExpectedTags(status),  fLineNum(status), fActualBreakPositions(status),
 189   fActualTags(status)
 190 {
 191 }
 192
 193 //
 194 // addDataChunk.   Add a section (non-breaking) piece if data to the test data.
 195 //                 The macro form collects the line number, which is helpful
 196 //                 when tracking down failures.
 197 //
 198 //                 A null data item is inserted at the start of each test's data
 199 //                  to put the starting zero into the data list.  The position saved for
 200 //                  each non-null item is its ending position.
 201 //
 202 #define ADD_DATACHUNK(td, data, tag, status)   td.addDataChunk(data, tag, __LINE__, status);
 203 void BITestData::addDataChunk(const char *data, int32_t tag, int32_t lineNum, UErrorCode status) {
 204     if (U_FAILURE(status)) {return;}
 205     if (data != NULL) {
 206         fDataToBreak.append(CharsToUnicodeString(data));
 207     }
 208     fExpectedBreakPositions.addElement(fDataToBreak.length(), status);
 209     fExpectedTags.addElement(tag, status);
 210     fLineNum.addElement(lineNum, status);
 211 }
 212
 213
 214 //
 215 //  checkResults.   Compare the actual and expected break positions, report any differences.
 216 //
 217 void BITestData::checkResults(const char *heading, RBBITest *test) {
 218     int32_t   expectedIndex = 0;
 219     int32_t   actualIndex = 0;
 220
 221     for (;;) {
 222         // If we've run through both the expected and actual results vectors, we're done.
 223         //   break out of the loop.
 224         if (expectedIndex >= fExpectedBreakPositions.size() &&
 225             actualIndex   >= fActualBreakPositions.size()) {
 226             break;
 227         }
 228
 229
 230         if (expectedIndex >= fExpectedBreakPositions.size()) {
 231             err(heading, test, expectedIndex-1, actualIndex);
 232             actualIndex++;
 233             continue;
 234         }
 235
 236         if (actualIndex >= fActualBreakPositions.size()) {
 237             err(heading, test, expectedIndex, actualIndex-1);
 238             expectedIndex++;
 239             continue;
 240         }
 241
 242         if (fActualBreakPositions.elementAti(actualIndex) != fExpectedBreakPositions.elementAti(expectedIndex)) {
 243             err(heading, test, expectedIndex, actualIndex);
 244             // Try to resync the positions of the indices, to avoid a rash of spurious erros.
 245             if (fActualBreakPositions.elementAti(actualIndex) < fExpectedBreakPositions.elementAti(expectedIndex)) {
 246                 actualIndex++;
 247             } else {
 248                 expectedIndex++;
 249             }
 250             continue;
 251         }
 252
 253         if (fActualTags.elementAti(actualIndex) != fExpectedTags.elementAti(expectedIndex)) {
 254             test->errln("%s, tag mismatch.  Test Line = %d, expected tag=%d, got %d",
 255                 heading, fLineNum.elementAt(expectedIndex),
 256                 fExpectedTags.elementAti(expectedIndex), fActualTags.elementAti(actualIndex));
 257         }
 258
 259         actualIndex++;
 260         expectedIndex++;
 261     }
 262 }
 263
 264 //
 265 //  err   -  An error was found.  Report it, along with information about where the
 266 //                                incorrectly broken test data appeared in the source file.
 267 //
 268 void    BITestData::err(const char *heading, RBBITest *test, int32_t expectedIdx, int32_t actualIdx)
 269 {
 270     int32_t   expected = fExpectedBreakPositions.elementAti(expectedIdx);
 271     int32_t   actual   = fActualBreakPositions.elementAti(actualIdx);
 272     int32_t   o        = 0;
 273     int32_t   line     = fLineNum.elementAti(expectedIdx);
 274     if (expectedIdx > 0) {
 275         // The line numbers are off by one because a premature break occurs somewhere
 276         //    within the previous item, rather than at the start of the current (expected) item.
 277         //    We want to report the offset of the unexpected break from the start of
 278         //      this previous item.
 279         o    = actual - fExpectedBreakPositions.elementAti(expectedIdx-1);
 280     }
 281     if (actual < expected) {
 282         test->errln("%s unexpected break at offset %d in test item from line %d. actual break: %d  expected break: %d", heading, o, line, actual, expected);
 283     } else {
 284         test->errln("%s Failed to find break at end of item from line %d. actual break: %d  expected break: %d", heading, line, actual, expected);
 285     }
 286 }
 287
 288
 289 void BITestData::clearResults() {
 290     fActualBreakPositions.removeAllElements();
 291     fActualTags.removeAllElements();
 292 }
 293
 294
 295 //-----------------------------------------------------------------------------------
 296 //
 297 //    Cannned Test Characters
 298 //
 299 //-----------------------------------------------------------------------------------
 300
 301 static const UChar cannedTestArray[] = {
 302     0x0001, 0x0002, 0x0003, 0x0004, 0x0020, 0x0021, '\\', 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0028, 0x0029, 0x002b, 0x002d, 0x0030, 0x0031,
 303     0x0032, 0x0033, 0x0034, 0x003c, 0x003d, 0x003e, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x005b, 0x005d, 0x005e, 0x005f, 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x007b,
 304     0x007d, 0x007c, 0x002c, 0x00a0, 0x00a2,
 305     0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00ab, 0x00ad, 0x00ae, 0x00af, 0x00b0, 0x00b2, 0x00b3,
 306     0x00b4, 0x00b9, 0x00bb, 0x00bc, 0x00bd, 0x02b0, 0x02b1, 0x02b2, 0x02b3, 0x02b4, 0x0300, 0x0301, 0x0302, 0x0303,
 307     0x0304, 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x0903, 0x093e, 0x093f, 0x0940, 0x0949, 0x0f3a, 0x0f3b, 0x2000,
 308     0x2001, 0x2002, 0x200c, 0x200d, 0x200e, 0x200f, 0x2010, 0x2011, 0x2012, 0x2028, 0x2029, 0x202a, 0x203e, 0x203f,
 309     0x2040, 0x20dd, 0x20de, 0x20df, 0x20e0, 0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x0000
 310 };
 311
 312 static UnicodeString* cannedTestChars = 0;
 313
 314 #define  halfNA     "\\u0928\\u094d\\u200d"
 315 #define  halfSA     "\\u0938\\u094d\\u200d"
 316 #define  halfCHA    "\\u091a\\u094d\\u200d"
 317 #define  halfKA     "\\u0915\\u094d\\u200d"
 318 #define  deadTA     "\\u0924\\u094d"
 319
 320 //--------------------------------------------------------------------------------------
 321 //
 322 //    RBBITest    constructor and destructor
 323 //
 324 //--------------------------------------------------------------------------------------
 325
 326 RBBITest::RBBITest() {
 327     UnicodeString temp(cannedTestArray);
 328     cannedTestChars = new UnicodeString();
 329     *cannedTestChars += (UChar)0x0000;
 330     *cannedTestChars += temp;
 331 }
 332
 333
 334 RBBITest::~RBBITest() {
 335     delete cannedTestChars;
 336 }
 337
 338
 339 static const int T_NUMBER = 100;
 340 static const int T_LETTER = 200;
 341 static const int T_H_OR_K = 300;
 342 static const int T_IDEO   = 400;
 343
 344
 345
 346
 347
 348
 349 //--------------------------------------------------------------------
 350 //Testing the BreakIterator for devanagari script
 351 //--------------------------------------------------------------------
 352
 353 #define deadRA   "\\u0930\\u094d"         /*deadform RA = devanagari RA + virama*/
 354 #define deadPHA  "\\u092b\\u094d"         /*deadform PHA = devanagari PHA + virama*/
 355 #define deadTTHA "\\u0920\\u094d"
 356 #define deadPA   "\\u092a\\u094d"
 357 #define deadSA   "\\u0938\\u094d"
 358 #define visarga  "\\u0903"                /*devanagari visarga looks like a english colon*/
 359
 360
 361
 362
 363
 364
 365 //-----------------------------------------------------------------------------------
 366 //
 367 //   Test for status {tag} return value from break rules.
 368 //        TODO:  a more thorough test.
 369 //
 370 //-----------------------------------------------------------------------------------
 371 void RBBITest::TestStatusReturn() {
 372      UnicodeString rulesString1("$Letters = [:L:];\n"
 373                                   "$Numbers = [:N:];\n"
 374                                   "$Letters+{1};\n"
 375                                   "$Numbers+{2};\n"
 376                                   "Help\\ {4}/me\\!;\n"
 377                                   "[^$Letters $Numbers];\n"
 378                                   "!.*;\n", -1, US_INV);
 379      UnicodeString testString1  = "abc123..abc Help me Help me!";
 380                                 // 01234567890123456789012345678
 381      int32_t bounds1[]   = {0, 3, 6, 7, 8, 11, 12, 16, 17, 19, 20, 25, 27, 28, -1};
 382      int32_t brkStatus[] = {0, 1, 2, 0, 0,  1,  0,  1,  0,  1,  0,  4,  1,  0, -1};
 383
 384      UErrorCode status=U_ZERO_ERROR;
 385      UParseError    parseError;
 386
 387      RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(rulesString1, parseError, status);
 388      if(U_FAILURE(status)) {
 389          dataerrln("FAIL : in construction - %s", u_errorName(status));
 390      } else {
 391          int32_t  pos;
 392          int32_t  i = 0;
 393          bi->setText(testString1);
 394          for (pos=bi->first(); pos!= BreakIterator::DONE; pos=bi->next()) {
 395              if (pos != bounds1[i]) {
 396                  errln("FAIL:  expected break at %d, got %d\n", bounds1[i], pos);
 397                  break;
 398              }
 399
 400              int tag = bi->getRuleStatus();
 401              if (tag != brkStatus[i]) {
 402                  errln("FAIL:  break at %d, expected tag %d, got tag %d\n", pos, brkStatus[i], tag);
 403                  break;
 404              }
 405              i++;
 406          }
 407      }
 408      delete bi;
 409 }
 410
 411
 412 static void printStringBreaks(UnicodeString ustr, int expected[],
 413                               int expectedcount)
 414 {
 415     UErrorCode status = U_ZERO_ERROR;
 416     char name[100];
 417     printf("code    alpha extend alphanum type word sent line name\n");
 418     int j;
 419     for (j = 0; j < ustr.length(); j ++) {
 420         if (expectedcount > 0) {
 421             int k;
 422             for (k = 0; k < expectedcount; k ++) {
 423                 if (j == expected[k]) {
 424                     printf("------------------------------------------------ %d\n",
 425                            j);
 426                 }
 427             }
 428         }
 429         UChar32 c = ustr.char32At(j);
 430         if (c > 0xffff) {
 431             j ++;
 432         }
 433         u_charName(c, U_UNICODE_CHAR_NAME, name, 100, &status);
 434         printf("%7x %5d %6d %8d %4s %4s %4s %4s %s\n", (int)c,
 435                            u_isUAlphabetic(c),
 436                            u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND),
 437                            u_isalnum(c),
 438                            u_getPropertyValueName(UCHAR_GENERAL_CATEGORY,
 439                                                   u_charType(c),
 440                                                   U_SHORT_PROPERTY_NAME),
 441                            u_getPropertyValueName(UCHAR_WORD_BREAK,
 442                                                   u_getIntPropertyValue(c,
 443                                                           UCHAR_WORD_BREAK),
 444                                                   U_SHORT_PROPERTY_NAME),
 445                            u_getPropertyValueName(UCHAR_SENTENCE_BREAK,
 446                                    u_getIntPropertyValue(c,
 447                                            UCHAR_SENTENCE_BREAK),
 448                                    U_SHORT_PROPERTY_NAME),
 449                            u_getPropertyValueName(UCHAR_LINE_BREAK,
 450                                    u_getIntPropertyValue(c,
 451                                            UCHAR_LINE_BREAK),
 452                                    U_SHORT_PROPERTY_NAME),
 453                            name);
 454     }
 455 }
 456
 457 void RBBITest::TestThaiLineBreak() {
 458     UErrorCode status = U_ZERO_ERROR;
 459     BITestData thaiLineSelection(status);
 460
 461     // \u0e2f-- the Thai paiyannoi character-- isn't a letter.  It's a symbol that
 462     // represents elided letters at the end of a long word.  It should be bound to
 463     // the end of the word and not treated as an independent punctuation mark.
 464
 465
 466     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 467     ADD_DATACHUNK(thaiLineSelection, "\\u0e2a\\u0e16\\u0e32\\u0e19\\u0e35\\u0e2f", 0, status);
 468     ADD_DATACHUNK(thaiLineSelection, "\\u0e08\\u0e30", 0, status);
 469     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e14\\u0e21", 0, status);
 470     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e08\\u0e49\\u0e32", 0, status);
 471 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32", 0, status);
 472 //        ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 473     ADD_DATACHUNK(thaiLineSelection, "\\u0e2b\\u0e19\\u0e49\\u0e32\\u0e17\\u0e35\\u0e48", 0, status);
 474     // the commented-out lines (I think) are the preferred result; this line is what our current dictionary is giving us
 475     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e2d\\u0e01", 0, status);
 476     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32", 0, status);
 477     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e23\\u0e48\\u0e07", 0, status);
 478     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e1a\\u0e32\\u0e22", 0, status);
 479     ADD_DATACHUNK(thaiLineSelection, "\\u0e2d\\u0e22\\u0e48\\u0e32\\u0e07", 0, status);
 480     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e15\\u0e47\\u0e21", 0, status);
 481
 482     // the one time where the paiyannoi occurs somewhere other than at the end
 483     // of a word is in the Thai abbrevation for "etc.", which both begins and
 484     // ends with a paiyannoi
 485     ADD_DATACHUNK(thaiLineSelection, "\\u0e2f\\u0e25\\u0e2f", 0, status);
 486     ADD_DATACHUNK(thaiLineSelection, "\\u0e17\\u0e35\\u0e48", 0, status);
 487     ADD_DATACHUNK(thaiLineSelection, "\\u0e19\\u0e31\\u0e49\\u0e19", 0, status);
 488
 489     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 490         Locale("th"), status);
 491     if (U_FAILURE(status))
 492     {
 493         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestThaiLineBreak. - %s", u_errorName(status));
 494         return;
 495     }
 496
 497     generalIteratorTest(*e, thaiLineSelection);
 498     delete e;
 499 }
 500
 501
 502
 503 void RBBITest::TestMixedThaiLineBreak()
 504 {
 505     UErrorCode   status = U_ZERO_ERROR;
 506     BITestData   thaiLineSelection(status);
 507
 508     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 509
 510
 511     // @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters
 512     // start
 513
 514     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 515     ADD_DATACHUNK(thaiLineSelection, "\\u0E1E\\u0E38\\u0E17\\u0E18\\u0E28\\u0E31\\u0E01\\u0E23\\u0E32\\u0E0A ", 0, status);
 516     ADD_DATACHUNK(thaiLineSelection, "2545 ", 0, status);
 517     ADD_DATACHUNK(thaiLineSelection, "\\u0E40\\u0E1B\\u0E47\\u0E19", 0, status);
 518     ADD_DATACHUNK(thaiLineSelection, "\\u0E1B\\u0E35", 0, status);
 519     ADD_DATACHUNK(thaiLineSelection, "\\u0E09\\u0E25\\u0E2D\\u0E07", 0, status);
 520     ADD_DATACHUNK(thaiLineSelection, "\\u0E04\\u0E23\\u0E1A", 0, status);
 521     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E2D\\u0E1A ", 0, status);
 522     ADD_DATACHUNK(thaiLineSelection, "\"\\u0E52\\u0E52\\u0E50 ", 0, status);
 523     ADD_DATACHUNK(thaiLineSelection, "\\u0E1b\\u0E35\" ", 0, status);
 524     ADD_DATACHUNK(thaiLineSelection, "\\u0E02\\u0E2d\\u0E07", 0, status);
 525     ADD_DATACHUNK(thaiLineSelection, "\\u0E01\\u0E23\\u0E38\\u0E07", 0, status);
 526     ADD_DATACHUNK(thaiLineSelection, "\\u0E23\\u0E31\\u0E15\\u0E19\\u0E42\\u0E01\\u0E2A\\u0E34\\u0E19\\u0E17\\u0E23\\u0E4C ", 0, status);
 527     ADD_DATACHUNK(thaiLineSelection, "(\\u0E01\\u0E23\\u0E38\\u0E07\\u0E40\\u0E17\\u0E1e\\u0E2F", 0, status);
 528     ADD_DATACHUNK(thaiLineSelection, "\\u0E2B\\u0E23\\u0E37\\u0E2D ", 0, status);
 529     ADD_DATACHUNK(thaiLineSelection, "Bangkok)", 0, status);
 530
 531     // @suwit - end of changes
 532
 533
 534     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale("th"), status);
 535     if (U_FAILURE(status))
 536     {
 537         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMixedThaiLineBreak. - %s", u_errorName(status));
 538         return;
 539     }
 540
 541
 542     generalIteratorTest(*e, thaiLineSelection);
 543     delete e;
 544 }
 545
 546
 547 void RBBITest::TestMaiyamok()
 548 {
 549     UErrorCode status = U_ZERO_ERROR;
 550     BITestData   thaiLineSelection(status);
 551     ADD_DATACHUNK(thaiLineSelection, NULL, 0, status);           // Break at start of data
 552     // the Thai maiyamok character is a shorthand symbol that means "repeat the previous
 553     // word".  Instead of appearing as a word unto itself, however, it's kept together
 554     // with the word before it
 555     ADD_DATACHUNK(thaiLineSelection, "\\u0e44\\u0e1b\\u0e46", 0, status);
 556     ADD_DATACHUNK(thaiLineSelection, "\\u0e21\\u0e32\\u0e46", 0, status);
 557     ADD_DATACHUNK(thaiLineSelection, "\\u0e23\\u0e30\\u0e2b\\u0e27\\u0e48\\u0e32\\u0e07", 0, status);
 558     ADD_DATACHUNK(thaiLineSelection, "\\u0e01\\u0e23\\u0e38\\u0e07", 0, status);
 559     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e17\\u0e1e", 0, status);
 560     ADD_DATACHUNK(thaiLineSelection, "\\u0e41\\u0e25\\u0e30", 0, status);
 561     ADD_DATACHUNK(thaiLineSelection, "\\u0e40\\u0e03\\u0e35", 0, status);
 562     ADD_DATACHUNK(thaiLineSelection, "\\u0e22\\u0e07", 0, status);
 563     ADD_DATACHUNK(thaiLineSelection, "\\u0e43\\u0e2b\\u0e21\\u0e48", 0, status);
 564
 565     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(
 566         Locale("th"), status);
 567
 568     if (U_FAILURE(status))
 569     {
 570         errcheckln(status, "Failed to create the BreakIterator for Thai locale in TestMaiyamok. - %s", u_errorName(status));
 571         return;
 572     }
 573     generalIteratorTest(*e, thaiLineSelection);
 574     delete e;
 575 }
 576
 577
 578
 579 void RBBITest::TestBug3818() {
 580     UErrorCode  status = U_ZERO_ERROR;
 581
 582     // Four Thai words...
 583     static const UChar thaiWordData[] = {  0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48,
 584                                            0x0E43,0x0E2B,0x0E0D,0x0E48, 0x0E43,0x0E2B,0x0E0D,0x0E48, 0 };
 585     UnicodeString  thaiStr(thaiWordData);
 586
 587     RuleBasedBreakIterator* bi =
 588         (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale("th"), status);
 589     if (U_FAILURE(status) || bi == NULL) {
 590         errcheckln(status, "Fail at file %s, line %d, status = %s", __FILE__, __LINE__, u_errorName(status));
 591         return;
 592     }
 593     bi->setText(thaiStr);
 594
 595     int32_t  startOfSecondWord = bi->following(1);
 596     if (startOfSecondWord != 4) {
 597         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 598             __FILE__, __LINE__, startOfSecondWord);
 599     }
 600     startOfSecondWord = bi->following(0);
 601     if (startOfSecondWord != 4) {
 602         errln("Fail at file %s, line %d expected start of word at 4, got %d",
 603             __FILE__, __LINE__, startOfSecondWord);
 604     }
 605     delete bi;
 606 }
 607
 608
 609 void RBBITest::TestJapaneseWordBreak() {
 610     UErrorCode status = U_ZERO_ERROR;
 611     BITestData   japaneseWordSelection(status);
 612
 613     ADD_DATACHUNK(japaneseWordSelection, NULL, 0, status);           // Break at start of data
 614     ADD_DATACHUNK(japaneseWordSelection, "\\u4ECA\\u65E5", 400, status); //2
 615     ADD_DATACHUNK(japaneseWordSelection, "\\u306F\\u3044\\u3044", 300, status); //5
 616     ADD_DATACHUNK(japaneseWordSelection, "\\u5929\\u6C17", 400, status); //7
 617     ADD_DATACHUNK(japaneseWordSelection, "\\u3067\\u3059\\u306D", 300, status); //10
 618     ADD_DATACHUNK(japaneseWordSelection, "\\u3002", 0, status); //11
 619     ADD_DATACHUNK(japaneseWordSelection, "\\u000D\\u000A", 0, status); //12
 620
 621     RuleBasedBreakIterator* e = (RuleBasedBreakIterator *)BreakIterator::createWordInstance(
 622         Locale("ja"), status);
 623     if (U_FAILURE(status))
 624     {
 625         errcheckln(status, "Failed to create the BreakIterator for Japanese locale in TestJapaneseWordBreak.\n");
 626         return;
 627     }
 628
 629     generalIteratorTest(*e, japaneseWordSelection);
 630     delete e;
 631 }
 632
 633 void RBBITest::TestTrieDict() {
 634     UErrorCode      status  = U_ZERO_ERROR;
 635
 636     //
 637     //  Open and read the test data file.
 638     //
 639     const char *testDataDirectory = IntlTest::getSourceTestData(status);
 640     char testFileName[1000];
 641     if (testDataDirectory == NULL || strlen(testDataDirectory) + strlen("riwords.txt") + 10 >= sizeof(testFileName)) {
 642         errln("Can't open test data.  Path too long.");
 643         return;
 644     }
 645     strcpy(testFileName, testDataDirectory);
 646     strcat(testFileName, "riwords.txt");
 647
 648     // Items needing deleting at the end
 649     MutableTrieDictionary *mutableDict = NULL;
 650     CompactTrieDictionary *compactDict = NULL;
 651     UnicodeSet            *breaks      = NULL;
 652     UChar                 *testFile    = NULL;
 653     StringEnumeration     *enumer1     = NULL;
 654     StringEnumeration     *enumer2     = NULL;
 655     MutableTrieDictionary *mutable2    = NULL;
 656     StringEnumeration     *cloneEnum   = NULL;
 657     CompactTrieDictionary *compact2    = NULL;
 658
 659
 660     const UnicodeString *originalWord = NULL;
 661     const UnicodeString *cloneWord    = NULL;
 662     UChar *current;
 663     UChar *word;
 664     UChar uc;
 665     int32_t wordLen;
 666     int32_t wordCount;
 667     int32_t testCount;
 668
 669     int    len;
 670     testFile = ReadAndConvertFile(testFileName, len, NULL, status);
 671     if (U_FAILURE(status)) {
 672         goto cleanup; /* something went wrong, error already output */
 673     }
 674
 675     mutableDict = new MutableTrieDictionary(0x0E1C, status);
 676     if (U_FAILURE(status)) {
 677         errln("Error creating MutableTrieDictionary: %s\n", u_errorName(status));
 678         goto cleanup;
 679     }
 680
 681     breaks = new UnicodeSet;
 682     breaks->add(0x000A);     // Line Feed
 683     breaks->add(0x000D);     // Carriage Return
 684     breaks->add(0x2028);     // Line Separator
 685     breaks->add(0x2029);     // Paragraph Separator
 686
 687     // Now add each non-comment line of the file as a word.
 688     current = testFile;
 689     word = current;
 690     uc = *current++;
 691     wordLen = 0;
 692     wordCount = 0;
 693
 694     while (uc) {
 695         if (uc == 0x0023) {     // #comment line, skip
 696             while (uc && !breaks->contains(uc)) {
 697                 uc = *current++;
 698             }
 699         }
 700         else while (uc && !breaks->contains(uc)) {
 701             ++wordLen;
 702             uc = *current++;
 703         }
 704         if (wordLen > 0) {
 705             mutableDict->addWord(word, wordLen, status);
 706             if (U_FAILURE(status)) {
 707                 errln("Could not add word to mutable dictionary; status %s\n", u_errorName(status));
 708                 goto cleanup;
 709             }
 710             wordCount += 1;
 711         }
 712
 713         // Find beginning of next line
 714         while (uc && breaks->contains(uc)) {
 715             uc = *current++;
 716         }
 717         word = current-1;
 718         wordLen = 0;
 719     }
 720
 721     if (wordCount < 50) {
 722         errln("Word count (%d) unreasonably small\n", wordCount);
 723         goto cleanup;
 724     }
 725
 726     enumer1 = mutableDict->openWords(status);
 727     if (U_FAILURE(status)) {
 728         errln("Could not open mutable dictionary enumerator: %s\n", u_errorName(status));
 729         goto cleanup;
 730     }
 731
 732     testCount = 0;
 733     if (wordCount != (testCount = enumer1->count(status))) {
 734         errln("MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 735             testCount, wordCount, u_errorName(status));
 736         goto cleanup;
 737     }
 738
 739     // Now compact it
 740     compactDict = new CompactTrieDictionary(*mutableDict, status);
 741     if (U_FAILURE(status)) {
 742         errln("Failed to create CompactTrieDictionary: %s\n", u_errorName(status));
 743         goto cleanup;
 744     }
 745
 746     enumer2 = compactDict->openWords(status);
 747     if (U_FAILURE(status)) {
 748         errln("Could not open compact trie dictionary enumerator: %s\n", u_errorName(status));
 749         goto cleanup;
 750     }
 751
 752     if (wordCount != (testCount = enumer2->count(status))) {
 753         errln("CompactTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 754             testCount, wordCount, u_errorName(status));
 755         goto cleanup;
 756     }
 757
 758     if (typeid(*enumer1) == typeid(*enumer2)) {
 759         errln("CompactTrieEnumeration and MutableTrieEnumeration typeids are the same");
 760     }
 761     delete enumer1;
 762     enumer1 = NULL;
 763     delete enumer2;
 764     enumer2 = NULL;
 765
 766     // Now un-compact it
 767     mutable2 = compactDict->cloneMutable(status);
 768     if (U_FAILURE(status)) {
 769         errln("Could not clone CompactTrieDictionary to MutableTrieDictionary: %s\n", u_errorName(status));
 770         goto cleanup;
 771     }
 772
 773     cloneEnum = mutable2->openWords(status);
 774     if (U_FAILURE(status)) {
 775         errln("Could not create cloned mutable enumerator: %s\n", u_errorName(status));
 776         goto cleanup;
 777     }
 778
 779     if (wordCount != (testCount = cloneEnum->count(status))) {
 780         errln("Cloned MutableTrieDictionary word count (%d) differs from file word count (%d), with status %s\n",
 781             testCount, wordCount, u_errorName(status));
 782         goto cleanup;
 783     }
 784
 785     // Compact original dictionary to clone. Note that we can only compare the same kind of
 786     // dictionary as the order of the enumerators is not guaranteed to be the same between
 787     // different kinds
 788     enumer1 = mutableDict->openWords(status);
 789     if (U_FAILURE(status)) {
 790         errln("Could not re-open mutable dictionary enumerator: %s\n", u_errorName(status));
 791         goto cleanup;
 792      }
 793
 794     originalWord = enumer1->snext(status);
 795     cloneWord = cloneEnum->snext(status);
 796     while (U_SUCCESS(status) && originalWord != NULL && cloneWord != NULL) {
 797         if (*originalWord != *cloneWord) {
 798             errln("Original and cloned MutableTrieDictionary word mismatch\n");
 799             goto cleanup;
 800         }
 801         originalWord = enumer1->snext(status);
 802         cloneWord = cloneEnum->snext(status);
 803     }
 804
 805     if (U_FAILURE(status)) {
 806         errln("Enumeration failed: %s\n", u_errorName(status));
 807         goto cleanup;
 808     }
 809
 810     if (originalWord != cloneWord) {
 811         errln("Original and cloned MutableTrieDictionary ended enumeration at different points\n");
 812         goto cleanup;
 813     }
 814
 815     // Test the data copying constructor for CompactTrieDict, and the data access APIs.
 816     compact2 = new CompactTrieDictionary(compactDict->data(), status);
 817     if (U_FAILURE(status)) {
 818         errln("CompactTrieDictionary(const void *,...) failed\n");
 819         goto cleanup;
 820     }
 821
 822     if (compact2->dataSize() == 0) {
 823         errln("CompactTrieDictionary->dataSize() == 0\n");
 824         goto cleanup;
 825     }
 826
 827     // Now count the words via the second dictionary
 828     delete enumer1;
 829     enumer1 = compact2->openWords(status);
 830     if (U_FAILURE(status)) {
 831         errln("Could not open compact trie dictionary 2 enumerator: %s\n", u_errorName(status));
 832         goto cleanup;
 833     }
 834
 835     if (wordCount != (testCount = enumer1->count(status))) {
 836         errln("CompactTrieDictionary 2 word count (%d) differs from file word count (%d), with status %s\n",
 837             testCount, wordCount, u_errorName(status));
 838         goto cleanup;
 839     }
 840
 841 cleanup:
 842     delete compactDict;
 843     delete mutableDict;
 844     delete breaks;
 845     delete[] testFile;
 846     delete enumer1;
 847     delete mutable2;
 848     delete cloneEnum;
 849     delete compact2;
 850 }
 851
 852
 853 //----------------------------------------------------------------------------
 854 //
 855 // generalIteratorTest      Given a break iterator and a set of test data,
 856 //                          Run the tests and report the results.
 857 //
 858 //----------------------------------------------------------------------------
 859 void RBBITest::generalIteratorTest(RuleBasedBreakIterator& bi, BITestData &td)
 860 {
 861
 862     bi.setText(td.fDataToBreak);
 863
 864     testFirstAndNext(bi, td);
 865
 866     testLastAndPrevious(bi, td);
 867
 868     testFollowing(bi, td);
 869     testPreceding(bi, td);
 870     testIsBoundary(bi, td);
 871     doMultipleSelectionTest(bi, td);
 872 }
 873
 874
 875 //
 876 //   testFirstAndNext.   Run the iterator forwards in the obvious first(), next()
 877 //                       kind of loop.
 878 //
 879 void RBBITest::testFirstAndNext(RuleBasedBreakIterator& bi, BITestData &td)
 880 {
 881     UErrorCode  status = U_ZERO_ERROR;
 882     int32_t     p;
 883     int32_t     lastP = -1;
 884     int32_t     tag;
 885
 886     logln("Test first and next");
 887     bi.setText(td.fDataToBreak);
 888     td.clearResults();
 889
 890     for (p=bi.first(); p!=RuleBasedBreakIterator::DONE; p=bi.next()) {
 891         td.fActualBreakPositions.addElement(p, status);  // Save result.
 892         tag = bi.getRuleStatus();
 893         td.fActualTags.addElement(tag, status);
 894         if (p <= lastP) {
 895             // If the iterator is not making forward progress, stop.
 896             //  No need to raise an error here, it'll be detected in the normal check of results.
 897             break;
 898         }
 899         lastP = p;
 900     }
 901     td.checkResults("testFirstAndNext", this);
 902 }
 903
 904
 905 //
 906 //  TestLastAndPrevious.   Run the iterator backwards, starting with last().
 907 //
 908 void  RBBITest::testLastAndPrevious(RuleBasedBreakIterator& bi,  BITestData &td)
 909 {
 910     UErrorCode  status = U_ZERO_ERROR;
 911     int32_t     p;
 912     int32_t     lastP  = 0x7ffffffe;
 913     int32_t     tag;
 914
 915     logln("Test last and previous");
 916     bi.setText(td.fDataToBreak);
 917     td.clearResults();
 918
 919     for (p=bi.last(); p!=RuleBasedBreakIterator::DONE; p=bi.previous()) {
 920         // Save break position.  Insert it at start of vector of results, shoving
 921         //    already-saved results further towards the end.
 922         td.fActualBreakPositions.insertElementAt(p, 0, status);
 923         // bi.previous();   // TODO:  Why does this fix things up????
 924         // bi.next();
 925         tag = bi.getRuleStatus();
 926         td.fActualTags.insertElementAt(tag, 0, status);
 927         if (p >= lastP) {
 928             // If the iterator is not making progress, stop.
 929             //  No need to raise an error here, it'll be detected in the normal check of results.
 930             break;
 931         }
 932         lastP = p;
 933     }
 934     td.checkResults("testLastAndPrevious", this);
 935 }
 936
 937
 938 void RBBITest::testFollowing(RuleBasedBreakIterator& bi, BITestData &td)
 939 {
 940     UErrorCode  status = U_ZERO_ERROR;
 941     int32_t     p;
 942     int32_t     tag;
 943     int32_t     lastP  = -2;     // A value that will never be returned as a break position.
 944                                  //   cannot be -1; that is returned for DONE.
 945     int         i;
 946
 947     logln("testFollowing():");
 948     bi.setText(td.fDataToBreak);
 949     td.clearResults();
 950
 951     // Save the starting point, since we won't get that out of following.
 952     p = bi.first();
 953     td.fActualBreakPositions.addElement(p, status);  // Save result.
 954     tag = bi.getRuleStatus();
 955     td.fActualTags.addElement(tag, status);
 956
 957     for (i = 0; i <= td.fDataToBreak.length()+1; i++) {
 958         p = bi.following(i);
 959         if (p != lastP) {
 960             if (p == RuleBasedBreakIterator::DONE) {
 961                 break;
 962             }
 963             // We've reached a new break position.  Save it.
 964             td.fActualBreakPositions.addElement(p, status);  // Save result.
 965             tag = bi.getRuleStatus();
 966             td.fActualTags.addElement(tag, status);
 967             lastP = p;
 968         }
 969     }
 970     // The loop normally exits by means of the break in the middle.
 971     // Make sure that the index was at the correct position for the break iterator to have
 972     //   returned DONE.
 973     if (i != td.fDataToBreak.length()) {
 974         errln("testFollowing():  iterator returned DONE prematurely.");
 975     }
 976
 977     // Full check of all results.
 978     td.checkResults("testFollowing", this);
 979 }
 980
 981
 982
 983 void RBBITest::testPreceding(RuleBasedBreakIterator& bi,  BITestData &td) {
 984     UErrorCode  status = U_ZERO_ERROR;
 985     int32_t     p;
 986     int32_t     tag;
 987     int32_t     lastP  = 0x7ffffffe;
 988     int         i;
 989
 990     logln("testPreceding():");
 991     bi.setText(td.fDataToBreak);
 992     td.clearResults();
 993
 994     p = bi.last();
 995     td.fActualBreakPositions.addElement(p, status);
 996     tag = bi.getRuleStatus();
 997     td.fActualTags.addElement(tag, status);
 998
 999     for (i = td.fDataToBreak.length(); i>=-1; i--) {
1000         p = bi.preceding(i);
1001         if (p != lastP) {
1002             if (p == RuleBasedBreakIterator::DONE) {
1003                 break;
1004             }
1005             // We've reached a new break position.  Save it.
1006             td.fActualBreakPositions.insertElementAt(p, 0, status);
1007             lastP = p;
1008             tag = bi.getRuleStatus();
1009             td.fActualTags.insertElementAt(tag, 0, status);
1010         }
1011     }
1012     // The loop normally exits by means of the break in the middle.
1013     // Make sure that the index was at the correct position for the break iterator to have
1014     //   returned DONE.
1015     if (i != 0) {
1016         errln("testPreceding():  iterator returned DONE prematurely.");
1017     }
1018
1019     // Full check of all results.
1020     td.checkResults("testPreceding", this);
1021 }
1022
1023
1024
1025 void RBBITest::testIsBoundary(RuleBasedBreakIterator& bi,  BITestData &td) {
1026     UErrorCode  status = U_ZERO_ERROR;
1027     int         i;
1028     int32_t     tag;
1029
1030     logln("testIsBoundary():");
1031     bi.setText(td.fDataToBreak);
1032     td.clearResults();
1033
1034     for (i = 0; i <= td.fDataToBreak.length(); i++) {
1035         if (bi.isBoundary(i)) {
1036             td.fActualBreakPositions.addElement(i, status);  // Save result.
1037             tag = bi.getRuleStatus();
1038             td.fActualTags.addElement(tag, status);
1039         }
1040     }
1041     td.checkResults("testIsBoundary: ", this);
1042 }
1043
1044
1045
1046 void RBBITest::doMultipleSelectionTest(RuleBasedBreakIterator& iterator, BITestData &td)
1047 {
1048     iterator.setText(td.fDataToBreak);
1049
1050     RuleBasedBreakIterator* testIterator =(RuleBasedBreakIterator*)iterator.clone();
1051     int32_t offset = iterator.first();
1052     int32_t testOffset;
1053     int32_t count = 0;
1054
1055     logln("doMultipleSelectionTest text of length: %d", td.fDataToBreak.length());
1056
1057     if (*testIterator != iterator)
1058         errln("clone() or operator!= failed: two clones compared unequal");
1059
1060     do {
1061         testOffset = testIterator->first();
1062         testOffset = testIterator->next(count);
1063         if (offset != testOffset)
1064             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1065
1066         if (offset != RuleBasedBreakIterator::DONE) {
1067             count++;
1068             offset = iterator.next();
1069
1070             if (offset != RuleBasedBreakIterator::DONE && *testIterator == iterator) {
1071                 errln("operator== failed: Two unequal iterators compared equal. count=%d offset=%d", count, offset);
1072                 if (count > 10000 || offset == -1) {
1073                     errln("operator== failed too many times. Stopping test.");
1074                     if (offset == -1) {
1075                         errln("Does (RuleBasedBreakIterator::DONE == -1)?");
1076                     }
1077                     return;
1078                 }
1079             }
1080         }
1081     } while (offset != RuleBasedBreakIterator::DONE);
1082
1083     // now do it backwards...
1084     offset = iterator.last();
1085     count = 0;
1086
1087     do {
1088         testOffset = testIterator->last();
1089         testOffset = testIterator->next(count);   // next() with a negative arg is same as previous
1090         if (offset != testOffset)
1091             errln(UnicodeString("next(n) and next() not returning consistent results: for step ") + count + ", next(n) returned " + testOffset + " and next() had " + offset);
1092
1093         if (offset != RuleBasedBreakIterator::DONE) {
1094             count--;
1095             offset = iterator.previous();
1096         }
1097     } while (offset != RuleBasedBreakIterator::DONE);
1098
1099     delete testIterator;
1100 }
1101
1102
1103 //---------------------------------------------
1104 //
1105 //     other tests
1106 //
1107 //---------------------------------------------
1108 void RBBITest::TestEmptyString()
1109 {
1110     UnicodeString text = "";
1111     UErrorCode status = U_ZERO_ERROR;
1112
1113     BITestData x(status);
1114     ADD_DATACHUNK(x, "", 0, status);           // Break at start of data
1115     RuleBasedBreakIterator* bi = (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
1116     if (U_FAILURE(status))
1117     {
1118         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEmptyString. - %s", u_errorName(status));
1119         return;
1120     }
1121     generalIteratorTest(*bi, x);
1122     delete bi;
1123 }
1124
1125 void RBBITest::TestGetAvailableLocales()
1126 {
1127     int32_t locCount = 0;
1128     const Locale* locList = BreakIterator::getAvailableLocales(locCount);
1129
1130     if (locCount == 0)
1131         dataerrln("getAvailableLocales() returned an empty list!");
1132     // Just make sure that it's returning good memory.
1133     int32_t i;
1134     for (i = 0; i < locCount; ++i) {
1135         logln(locList[i].getName());
1136     }
1137 }
1138
1139 //Testing the BreakIterator::getDisplayName() function
1140 void RBBITest::TestGetDisplayName()
1141 {
1142     UnicodeString   result;
1143
1144     BreakIterator::getDisplayName(Locale::getUS(), result);
1145     if (Locale::getDefault() == Locale::getUS() && result != "English (United States)")
1146         dataerrln("BreakIterator::getDisplayName() failed: expected \"English (United States)\", got \""
1147                 + result);
1148
1149     BreakIterator::getDisplayName(Locale::getFrance(), Locale::getUS(), result);
1150     if (result != "French (France)")
1151         dataerrln("BreakIterator::getDisplayName() failed: expected \"French (France)\", got \""
1152                 + result);
1153 }
1154 /**
1155  * Test End Behaviour
1156  * @bug 4068137
1157  */
1158 void RBBITest::TestEndBehaviour()
1159 {
1160     UErrorCode status = U_ZERO_ERROR;
1161     UnicodeString testString("boo.");
1162     BreakIterator *wb = BreakIterator::createWordInstance(Locale::getDefault(), status);
1163     if (U_FAILURE(status))
1164     {
1165         errcheckln(status, "Failed to create the BreakIterator for default locale in TestEndBehaviour. - %s", u_errorName(status));
1166         return;
1167     }
1168     wb->setText(testString);
1169
1170     if (wb->first() != 0)
1171         errln("Didn't get break at beginning of string.");
1172     if (wb->next() != 3)
1173         errln("Didn't get break before period in \"boo.\"");
1174     if (wb->current() != 4 && wb->next() != 4)
1175         errln("Didn't get break at end of string.");
1176     delete wb;
1177 }
1178 /*
1179  * @bug 4153072
1180  */
1181 void RBBITest::TestBug4153072() {
1182     UErrorCode status = U_ZERO_ERROR;
1183     BreakIterator *iter = BreakIterator::createWordInstance(Locale::getDefault(), status);
1184     if (U_FAILURE(status))
1185     {
1186         errcheckln(status, "Failed to create the BreakIterator for default locale in TestBug4153072 - %s", u_errorName(status));
1187         return;
1188     }
1189     UnicodeString str("...Hello, World!...");
1190     int32_t begin = 3;
1191     int32_t end = str.length() - 3;
1192     UBool onBoundary;
1193
1194     StringCharacterIterator* textIterator = new StringCharacterIterator(str, begin, end, begin);
1195     iter->adoptText(textIterator);
1196     int index;
1197     // Note: with the switch to UText, there is no way to restrict the
1198     //       iteration range to begin at an index other than zero.
1199     //       String character iterators created with a non-zero bound are
1200     //         treated by RBBI as being empty.
1201     for (index = -1; index < begin + 1; ++index) {
1202         onBoundary = iter->isBoundary(index);
1203         if (index == 0?  !onBoundary : onBoundary) {
1204             errln((UnicodeString)"Didn't handle isBoundary correctly with offset = " + index +
1205                             " and begin index = " + begin);
1206         }
1207     }
1208     delete iter;
1209 }
1210
1211
1212 //
1213 // Test for problem reported by Ashok Matoria on 9 July 2007
1214 //    One.<kSoftHyphen><kSpace>Two.
1215 //
1216 //    Sentence break at start (0) and then on calling next() it breaks at
1217 //   'T' of "Two". Now, at this point if I do next() and
1218 //    then previous(), it breaks at <kSOftHyphen> instead of 'T' of "Two".
1219 //
1220 void RBBITest::TestBug5775() {
1221     UErrorCode status = U_ZERO_ERROR;
1222     BreakIterator *bi = BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
1223     TEST_ASSERT_SUCCESS(status);
1224     if (U_FAILURE(status)) {
1225         return;
1226     }
1227 // Check for status first for better handling of no data errors.
1228     TEST_ASSERT(bi != NULL);
1229     if (bi == NULL) {
1230         return;
1231     }
1232
1233     UnicodeString s("One.\\u00ad Two.", -1, US_INV);
1234     //               01234      56789
1235     s = s.unescape();
1236     bi->setText(s);
1237     int pos = bi->next();
1238     TEST_ASSERT(pos == 6);
1239     pos = bi->next();
1240     TEST_ASSERT(pos == 10);
1241     pos = bi->previous();
1242     TEST_ASSERT(pos == 6);
1243     delete bi;
1244 }
1245
1246
1247
/**
 * Test Japanese Line Break
 *
 * NOTE: the entire body is currently compiled out with "#if 0", so this test
 * does nothing when run.  The disabled code places each candidate character
 * between two ideographs and checks where a Japanese line break iterator
 * breaks around it.
 * @bug 4095322
 */
void RBBITest::TestJapaneseLineBreak()
{
#if 0
    // Test needs updating some more...   Dump it for now.


    // Change for Unicode TR 14:  Punctuation characters with categories Pi and Pf do not count
    //        as opening and closing punctuation for line breaking.
    //        Also, \u30fc and \u30fe are not counted as hyphens.   Remove these chars
    //        from these tests.    6-13-2002
    //
    UErrorCode status = U_ZERO_ERROR;
    // Three-character template; the middle character (index 1) is replaced in each iteration below.
    UnicodeString testString = CharsToUnicodeString("\\u4e00x\\u4e8c");
    // Characters expected to allow a break before them but not after them.
    UnicodeString precedingChars = CharsToUnicodeString(
        //"([{\\u00ab$\\u00a5\\u00a3\\u00a4\\u2018\\u201a\\u201c\\u201e\\u201b\\u201f");
        "([{$\\u00a5\\u00a3\\u00a4\\u201a\\u201e");
    // Characters expected to allow a break after them but not before them.
    UnicodeString followingChars = CharsToUnicodeString(
        // ")]}\\u00bb!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7\\u30fc"
        ")]}!%,.\\u3001\\u3002\\u3063\\u3083\\u3085\\u3087\\u30c3\\u30e3\\u30e5\\u30e7"
        // ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u30fe\\u2019\\u201d\\u00b0\\u2032\\u2033\\u2034"
        ":;\\u309b\\u309c\\u3005\\u309d\\u309e\\u30fd\\u00b0\\u2032\\u2033\\u2034"
        "\\u2030\\u2031\\u2103\\u2109\\u00a2\\u0300\\u0301\\u0302");
    BreakIterator *iter = BreakIterator::createLineInstance(Locale::getJapan(), status);

    int32_t i;
    if (U_FAILURE(status))
    {
        errln("Failed to create the BreakIterator for Japanese locale in TestJapaneseLineBreak.\n");
        return;
    }

    // Expected boundaries for each "preceding" character: 0, 1, 3.
    for (i = 0; i < precedingChars.length(); i++) {
        testString.setCharAt(1, precedingChars[i]);
        iter->setText(testString);
        int32_t j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 1)
            errln("ja line break failure: failed to stop before '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to skip position after '" + UCharToUnicodeString(precedingChars[i])
                        + "' (" + ((int)(precedingChars[i])) + ")");
    }

    // Expected boundaries for each "following" character: 0, 2, 3.
    for (i = 0; i < followingChars.length(); i++) {
        testString.setCharAt(1, followingChars[i]);
        iter->setText(testString);
        int j = iter->first();
        if (j != 0)
            errln("ja line break failure: failed to start at 0");
        j = iter->next();
        if (j != 2)
            errln("ja line break failure: failed to skip position before '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
        j = iter->next();
        if (j != 3)
            errln("ja line break failure: failed to stop after '" + UCharToUnicodeString(followingChars[i])
                        + "' (" + ((int)(followingChars[i])) + ")");
    }
    delete iter;
#endif
}
1317
1318
1319 //------------------------------------------------------------------------------
1320 //
1321 //   RBBITest::Extended    Run  RBBI Tests from an external test data file
1322 //
1323 //------------------------------------------------------------------------------
1324
// Parameters for one run of RBBITest::executeTest().
// The three vectors are indexed by position in dataToBreak.
struct TestParams {
    BreakIterator   *bi;               // Iterator under test; executeTest() returns at once when NULL.
    UnicodeString    dataToBreak;      // Text handed to bi->setText().
    UVector32       *expectedBreaks;   // Per position: 0 = no break expected; -1 = break expected with
                                       //   rule status 0; any other value = break expected with that
                                       //   rule status.
    UVector32       *srcLine;          // Per position: value reported as the test data "File line" in errors.
    UVector32       *srcCol;           // Per position: value reported as the test data "col" in errors.
};
1332
1333 void RBBITest::executeTest(TestParams *t) {
1334     int32_t    bp;
1335     int32_t    prevBP;
1336     int32_t    i;
1337
1338     if (t->bi == NULL) {
1339         return;
1340     }
1341
1342     t->bi->setText(t->dataToBreak);
1343     //
1344     //  Run the iterator forward
1345     //
1346     prevBP = -1;
1347     for (bp = t->bi->first(); bp != BreakIterator::DONE; bp = t->bi->next()) {
1348         if (prevBP ==  bp) {
1349             // Fail for lack of forward progress.
1350             errln("Forward Iteration, no forward progress.  Break Pos=%4d  File line,col=%4d,%4d",
1351                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1352             break;
1353         }
1354
1355         // Check that there were we didn't miss an expected break between the last one
1356         //  and this one.
1357         for (i=prevBP+1; i<bp; i++) {
1358             if (t->expectedBreaks->elementAti(i) != 0) {
1359                 int expected[] = {0, i};
1360                 printStringBreaks(t->dataToBreak, expected, 2);
1361                 errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1362                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1363             }
1364         }
1365
1366         // Check that the break we did find was expected
1367         if (t->expectedBreaks->elementAti(bp) == 0) {
1368             int expected[] = {0, bp};
1369             printStringBreaks(t->dataToBreak, expected, 2);
1370             errln("Forward Iteration, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1371                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1372         } else {
1373             // The break was expected.
1374             //   Check that the {nnn} tag value is correct.
1375             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1376             if (expectedTagVal == -1) {
1377                 expectedTagVal = 0;
1378             }
1379             int32_t line = t->srcLine->elementAti(bp);
1380             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1381             if (rs != expectedTagVal) {
1382                 errln("Incorrect status for forward break.  Pos=%4d  File line,col= %4d,%4d.\n"
1383                       "          Actual, Expected status = %4d, %4d",
1384                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1385             }
1386         }
1387
1388
1389         prevBP = bp;
1390     }
1391
1392     // Verify that there were no missed expected breaks after the last one found
1393     for (i=prevBP+1; i<t->expectedBreaks->size(); i++) {
1394         if (t->expectedBreaks->elementAti(i) != 0) {
1395             errln("Forward Iteration, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1396                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1397         }
1398     }
1399
1400     //
1401     //  Run the iterator backwards, verify that the same breaks are found.
1402     //
1403     prevBP = t->dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
1404     for (bp = t->bi->last(); bp != BreakIterator::DONE; bp = t->bi->previous()) {
1405         if (prevBP ==  bp) {
1406             // Fail for lack of progress.
1407             errln("Reverse Iteration, no progress.  Break Pos=%4d  File line,col=%4d,%4d",
1408                 bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1409             break;
1410         }
1411
1412         // Check that there were we didn't miss an expected break between the last one
1413         //  and this one.  (UVector returns zeros for index out of bounds.)
1414         for (i=prevBP-1; i>bp; i--) {
1415             if (t->expectedBreaks->elementAti(i) != 0) {
1416                 errln("Reverse Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1417                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1418             }
1419         }
1420
1421         // Check that the break we did find was expected
1422         if (t->expectedBreaks->elementAti(bp) == 0) {
1423             errln("Reverse Itertion, break found, but not expected.  Pos=%4d  File line,col= %4d,%4d",
1424                    bp, t->srcLine->elementAti(bp), t->srcCol->elementAti(bp));
1425         } else {
1426             // The break was expected.
1427             //   Check that the {nnn} tag value is correct.
1428             int32_t expectedTagVal = t->expectedBreaks->elementAti(bp);
1429             if (expectedTagVal == -1) {
1430                 expectedTagVal = 0;
1431             }
1432             int line = t->srcLine->elementAti(bp);
1433             int32_t rs = ((RuleBasedBreakIterator *)t->bi)->getRuleStatus();
1434             if (rs != expectedTagVal) {
1435                 errln("Incorrect status for reverse break.  Pos=%4d  File line,col= %4d,%4d.\n"
1436                       "          Actual, Expected status = %4d, %4d",
1437                     bp, line, t->srcCol->elementAti(bp), rs, expectedTagVal);
1438             }
1439         }
1440
1441         prevBP = bp;
1442     }
1443
1444     // Verify that there were no missed breaks prior to the last one found
1445     for (i=prevBP-1; i>=0; i--) {
1446         if (t->expectedBreaks->elementAti(i) != 0) {
1447             errln("Forward Itertion, break expected, but not found.  Pos=%4d  File line,col= %4d,%4d",
1448                       i, t->srcLine->elementAti(i), t->srcCol->elementAti(i));
1449         }
1450     }
1451 }
1452
1453
1454 void RBBITest::TestExtended() {
1455 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1456     UErrorCode      status  = U_ZERO_ERROR;
1457     Locale          locale("");
1458
1459     UnicodeString       rules;
1460     TestParams          tp;
1461     tp.bi             = NULL;
1462     tp.expectedBreaks = new UVector32(status);
1463     tp.srcLine        = new UVector32(status);
1464     tp.srcCol         = new UVector32(status);
1465
1466     RegexMatcher      localeMatcher(UNICODE_STRING_SIMPLE("<locale *([\\p{L}\\p{Nd}_]*) *>"), 0, status);
1467     if (U_FAILURE(status)) {
1468         dataerrln("Failure in file %s, line %d, status = \"%s\"", __FILE__, __LINE__, u_errorName(status));
1469     }
1470
1471
1472     //
1473     //  Open and read the test data file.
1474     //
1475     const char *testDataDirectory = IntlTest::getSourceTestData(status);
1476     char testFileName[1000];
1477     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
1478         errln("Can't open test data.  Path too long.");
1479         return;
1480     }
1481     strcpy(testFileName, testDataDirectory);
1482     strcat(testFileName, "rbbitst.txt");
1483
1484     int    len;
1485     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
1486     if (U_FAILURE(status)) {
1487         return; /* something went wrong, error already output */
1488     }
1489
1490
1491
1492
1493     //
1494     //  Put the test data into a UnicodeString
1495     //
1496     UnicodeString testString(FALSE, testFile, len);
1497
1498     enum EParseState{
1499         PARSE_COMMENT,
1500         PARSE_TAG,
1501         PARSE_DATA,
1502         PARSE_NUM
1503     }
1504     parseState = PARSE_TAG;
1505
1506     EParseState savedState = PARSE_TAG;
1507
1508     static const UChar CH_LF        = 0x0a;
1509     static const UChar CH_CR        = 0x0d;
1510     static const UChar CH_HASH      = 0x23;
1511     /*static const UChar CH_PERIOD    = 0x2e;*/
1512     static const UChar CH_LT        = 0x3c;
1513     static const UChar CH_GT        = 0x3e;
1514     static const UChar CH_BACKSLASH = 0x5c;
1515     static const UChar CH_BULLET    = 0x2022;
1516
1517     int32_t    lineNum  = 1;
1518     int32_t    colStart = 0;
1519     int32_t    column   = 0;
1520     int32_t    charIdx  = 0;
1521
1522     int32_t    tagValue = 0;       // The numeric value of a <nnn> tag.
1523
1524     for (charIdx = 0; charIdx < len; ) {
1525         status = U_ZERO_ERROR;
1526         UChar  c = testString.charAt(charIdx);
1527         charIdx++;
1528         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
1529             // treat CRLF as a unit
1530             c = CH_LF;
1531             charIdx++;
1532         }
1533         if (c == CH_LF || c == CH_CR) {
1534             lineNum++;
1535             colStart = charIdx;
1536         }
1537         column = charIdx - colStart + 1;
1538
1539         switch (parseState) {
1540         case PARSE_COMMENT:
1541             if (c == 0x0a || c == 0x0d) {
1542                 parseState = savedState;
1543             }
1544             break;
1545
1546         case PARSE_TAG:
1547             {
1548             if (c == CH_HASH) {
1549                 parseState = PARSE_COMMENT;
1550                 savedState = PARSE_TAG;
1551                 break;
1552             }
1553             if (u_isUWhiteSpace(c)) {
1554                 break;
1555             }
1556             if (testString.compare(charIdx-1, 6, "<word>") == 0) {
1557                 delete tp.bi;
1558                 tp.bi = BreakIterator::createWordInstance(locale,  status);
1559                 charIdx += 5;
1560                 break;
1561             }
1562             if (testString.compare(charIdx-1, 6, "<char>") == 0) {
1563                 delete tp.bi;
1564                 tp.bi = BreakIterator::createCharacterInstance(locale,  status);
1565                 charIdx += 5;
1566                 break;
1567             }
1568             if (testString.compare(charIdx-1, 6, "<line>") == 0) {
1569                 delete tp.bi;
1570                 tp.bi = BreakIterator::createLineInstance(locale,  status);
1571                 charIdx += 5;
1572                 break;
1573             }
1574             if (testString.compare(charIdx-1, 6, "<sent>") == 0) {
1575                 delete tp.bi;
1576                 tp.bi = NULL;
1577                 tp.bi = BreakIterator::createSentenceInstance(locale,  status);
1578                 charIdx += 5;
1579                 break;
1580             }
1581             if (testString.compare(charIdx-1, 7, "<title>") == 0) {
1582                 delete tp.bi;
1583                 tp.bi = BreakIterator::createTitleInstance(locale,  status);
1584                 charIdx += 6;
1585                 break;
1586             }
1587
1588             // <locale  loc_name>
1589             localeMatcher.reset(testString);
1590             if (localeMatcher.lookingAt(charIdx-1, status)) {
1591                 UnicodeString localeName = localeMatcher.group(1, status);
1592                 char localeName8[100];
1593                 localeName.extract(0, localeName.length(), localeName8, sizeof(localeName8), 0);
1594                 locale = Locale::createFromName(localeName8);
1595                 charIdx += localeMatcher.group(0, status).length();
1596                 TEST_ASSERT_SUCCESS(status);
1597                 break;
1598             }
1599             if (testString.compare(charIdx-1, 6, "<data>") == 0) {
1600                 parseState = PARSE_DATA;
1601                 charIdx += 5;
1602                 tp.dataToBreak = "";
1603                 tp.expectedBreaks->removeAllElements();
1604                 tp.srcCol ->removeAllElements();
1605                 tp.srcLine->removeAllElements();
1606                 break;
1607             }
1608
1609             errln("line %d: Tag expected in test file.", lineNum);
1610             parseState = PARSE_COMMENT;
1611             savedState = PARSE_DATA;
1612             goto end_test; // Stop the test.
1613             }
1614             break;
1615
1616         case PARSE_DATA:
1617             if (c == CH_BULLET) {
1618                 int32_t  breakIdx = tp.dataToBreak.length();
1619                 tp.expectedBreaks->setSize(breakIdx+1);
1620                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1621                 tp.srcLine->setSize(breakIdx+1);
1622                 tp.srcLine->setElementAt(lineNum, breakIdx);
1623                 tp.srcCol ->setSize(breakIdx+1);
1624                 tp.srcCol ->setElementAt(column, breakIdx);
1625                 break;
1626             }
1627
1628             if (testString.compare(charIdx-1, 7, "</data>") == 0) {
1629                 // Add final entry to mappings from break location to source file position.
1630                 //  Need one extra because last break position returned is after the
1631                 //    last char in the data, not at the last char.
1632                 tp.srcLine->addElement(lineNum, status);
1633                 tp.srcCol ->addElement(column, status);
1634
1635                 parseState = PARSE_TAG;
1636                 charIdx += 6;
1637
1638                 // RUN THE TEST!
1639                 executeTest(&tp);
1640                 break;
1641             }
1642
1643             if (testString.compare(charIdx-1, 3, UNICODE_STRING_SIMPLE("\\N{")) == 0) {
1644                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
1645                 // Get the code point from the name and insert it into the test data.
1646                 //   (Damn, no API takes names in Unicode  !!!
1647                 //    we've got to take it back to char *)
1648                 int32_t nameEndIdx = testString.indexOf((UChar)0x7d/*'}'*/, charIdx);
1649                 int32_t nameLength = nameEndIdx - (charIdx+2);
1650                 char charNameBuf[200];
1651                 UChar32 theChar = -1;
1652                 if (nameEndIdx != -1) {
1653                     UErrorCode status = U_ZERO_ERROR;
1654                     testString.extract(charIdx+2, nameLength, charNameBuf, sizeof(charNameBuf));
1655                     charNameBuf[sizeof(charNameBuf)-1] = 0;
1656                     theChar = u_charFromName(U_UNICODE_CHAR_NAME, charNameBuf, &status);
1657                     if (U_FAILURE(status)) {
1658                         theChar = -1;
1659                     }
1660                 }
1661                 if (theChar == -1) {
1662                     errln("Error in named character in test file at line %d, col %d",
1663                         lineNum, column);
1664                 } else {
1665                     // Named code point was recognized.  Insert it
1666                     //   into the test data.
1667                     tp.dataToBreak.append(theChar);
1668                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1669                         tp.srcLine->addElement(lineNum, status);
1670                         tp.srcCol ->addElement(column, status);
1671                     }
1672                 }
1673                 if (nameEndIdx > charIdx) {
1674                     charIdx = nameEndIdx+1;
1675
1676                 }
1677                 break;
1678             }
1679
1680
1681
1682
1683             if (testString.compare(charIdx-1, 2, "<>") == 0) {
1684                 charIdx++;
1685                 int32_t  breakIdx = tp.dataToBreak.length();
1686                 tp.expectedBreaks->setSize(breakIdx+1);
1687                 tp.expectedBreaks->setElementAt(-1, breakIdx);
1688                 tp.srcLine->setSize(breakIdx+1);
1689                 tp.srcLine->setElementAt(lineNum, breakIdx);
1690                 tp.srcCol ->setSize(breakIdx+1);
1691                 tp.srcCol ->setElementAt(column, breakIdx);
1692                 break;
1693             }
1694
1695             if (c == CH_LT) {
1696                 tagValue   = 0;
1697                 parseState = PARSE_NUM;
1698                 break;
1699             }
1700
1701             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
1702                 parseState = PARSE_COMMENT;
1703                 savedState = PARSE_DATA;
1704                 break;
1705             }
1706
1707             if (c == CH_BACKSLASH) {
1708                 // Check for \ at end of line, a line continuation.
1709                 //     Advance over (discard) the newline
1710                 UChar32 cp = testString.char32At(charIdx);
1711                 if (cp == CH_CR && charIdx<len && testString.charAt(charIdx+1) == CH_LF) {
1712                     // We have a CR LF
1713                     //  Need an extra increment of the input ptr to move over both of them
1714                     charIdx++;
1715                 }
1716                 if (cp == CH_LF || cp == CH_CR) {
1717                     lineNum++;
1718                     colStart = charIdx;
1719                     charIdx++;
1720                     break;
1721                 }
1722
1723                 // Let unescape handle the back slash.
1724                 cp = testString.unescapeAt(charIdx);
1725                 if (cp != -1) {
1726                     // Escape sequence was recognized.  Insert the char
1727                     //   into the test data.
1728                     tp.dataToBreak.append(cp);
1729                     while (tp.dataToBreak.length() > tp.srcLine->size()) {
1730                         tp.srcLine->addElement(lineNum, status);
1731                         tp.srcCol ->addElement(column, status);
1732                     }
1733                     break;
1734                 }
1735
1736
1737                 // Not a recognized backslash escape sequence.
1738                 // Take the next char as a literal.
1739                 //  TODO:  Should this be an error?
1740                 c = testString.charAt(charIdx);
1741                 charIdx = testString.moveIndex32(charIdx, 1);
1742             }
1743
1744             // Normal, non-escaped data char.
1745             tp.dataToBreak.append(c);
1746
1747             // Save the mapping from offset in the data to line/column numbers in
1748             //   the original input file.  Will be used for better error messages only.
1749             //   If there's an expected break before this char, the slot in the mapping
1750             //     vector will already be set for this char; don't overwrite it.
1751             if (tp.dataToBreak.length() > tp.srcLine->size()) {
1752                 tp.srcLine->addElement(lineNum, status);
1753                 tp.srcCol ->addElement(column, status);
1754             }
1755             break;
1756
1757
1758         case PARSE_NUM:
1759             // We are parsing an expected numeric tag value, like <1234>,
1760             //   within a chunk of data.
1761             if (u_isUWhiteSpace(c)) {
1762                 break;
1763             }
1764
1765             if (c == CH_GT) {
1766                 // Finished the number.  Add the info to the expected break data,
1767                 //   and switch parse state back to doing plain data.
1768                 parseState = PARSE_DATA;
1769                 if (tagValue == 0) {
1770                     tagValue = -1;
1771                 }
1772                 int32_t  breakIdx = tp.dataToBreak.length();
1773                 tp.expectedBreaks->setSize(breakIdx+1);
1774                 tp.expectedBreaks->setElementAt(tagValue, breakIdx);
1775                 tp.srcLine->setSize(breakIdx+1);
1776                 tp.srcLine->setElementAt(lineNum, breakIdx);
1777                 tp.srcCol ->setSize(breakIdx+1);
1778                 tp.srcCol ->setElementAt(column, breakIdx);
1779                 break;
1780             }
1781
1782             if (u_isdigit(c)) {
1783                 tagValue = tagValue*10 + u_charDigitValue(c);
1784                 break;
1785             }
1786
1787             errln("Syntax Error in test file at line %d, col %d",
1788                 lineNum, column);
1789             parseState = PARSE_COMMENT;
1790             goto end_test; // Stop the test
1791             break;
1792         }
1793
1794
1795         if (U_FAILURE(status)) {
1796             errln("ICU Error %s while parsing test file at line %d.",
1797                 u_errorName(status), lineNum);
1798             status = U_ZERO_ERROR;
1799             goto end_test; // Stop the test
1800         }
1801
1802     }
1803
1804 end_test:
1805     delete tp.bi;
1806     delete tp.expectedBreaks;
1807     delete tp.srcLine;
1808     delete tp.srcCol;
1809     delete [] testFile;
1810 #endif
1811 }
1812
1813 void RBBITest::TestThaiBreaks() {
1814     UErrorCode status=U_ZERO_ERROR;
1815     BreakIterator* b;
1816     Locale locale = Locale("th");
1817     int32_t p, index;
1818     UChar c[]= {
1819             0x0E01, 0x0E39, 0x0020, 0x0E01, 0x0E34, 0x0E19, 0x0E01, 0x0E38, 0x0E49, 0x0E07, 0x0020, 0x0E1B,
1820             0x0E34, 0x0E49, 0x0E48, 0x0E07, 0x0E2D, 0x0E22, 0x0E39, 0x0E48, 0x0E43, 0x0E19,
1821             0x0E16, 0x0E49, 0x0E33, 0x0000
1822     };
1823     int32_t expectedWordResult[] = {
1824             2, 3, 6, 10, 11, 15, 17, 20, 22
1825     };
1826     int32_t expectedLineResult[] = {
1827             3, 6, 11, 15, 17, 20, 22
1828     };
1829
1830     int32_t size = u_strlen(c);
1831     UnicodeString text=UnicodeString(c);
1832
1833     b = BreakIterator::createWordInstance(locale, status);
1834     if (U_FAILURE(status)) {
1835         errcheckln(status, "Unable to create thai word break iterator. - %s", u_errorName(status));
1836         return;
1837     }
1838     b->setText(text);
1839     p = index = 0;
1840     while ((p=b->next())!=BreakIterator::DONE && p < size) {
1841         if (p != expectedWordResult[index++]) {
1842             errln("Incorrect break given by thai word break iterator. Expected: %d  Got: %d", expectedWordResult[index-1], p);
1843         }
1844     }
1845     delete b;
1846
1847     b = BreakIterator::createLineInstance(locale, status);
1848     if (U_FAILURE(status)) {
1849         printf("Unable to create thai line break iterator.\n");
1850         return;
1851     }
1852     b->setText(text);
1853     p = index = 0;
1854     while ((p=b->next())!=BreakIterator::DONE && p < size) {
1855         if (p != expectedLineResult[index++]) {
1856             errln("Incorrect break given by thai line break iterator. Expected: %d  Got: %d", expectedLineResult[index-1], p);
1857         }
1858     }
1859
1860     delete b;
1861 }
1862
//
// Test data for TestTailoredBreaks.  Each case supplies the text (with \uXXXX
// escapes kept as literal characters, to be unescaped later), the offsets
// expected from the tailored (locale-specific) iterator ("T") and the offsets
// expected from the root iterator ("R").  Where the two offset lists are laid
// out in columns, a gap marks an offset present in only one of them.
//

// Word breaks, locale "en_US_POSIX":
//   words do not include colon or period (cldrbug #1969).
static const char    posxWordText[]     = "Can't have breaks in xx:yy or struct.field for CS-types.";
static const int32_t posxWordTOffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21, 23, 24, 26, 27, 29, 30, 36, 37, 42, 43, 46, 47, 49, 50, 55, 56
};
static const int32_t posxWordROffsets[] = {
    5, 6, 10, 11, 17, 18, 20, 21,         26, 27, 29, 30,         42, 43, 46, 47, 49, 50, 55, 56
};

// Word breaks, locale "ja":
//   no breaks inside runs of hiragana or runs of ideographs, where the latter
//   include \u3005 \u3007 \u303B (cldrbug #2009).
static const char    jaWordText[]     = "\\u79C1\\u9054\\u306B\\u4E00\\u3007\\u3007\\u3007\\u306E\\u30B3\\u30F3\\u30D4\\u30E5\\u30FC\\u30BF"
                                        "\\u304C\\u3042\\u308B\\u3002\\u5948\\u3005\\u306F\\u30EF\\u30FC\\u30C9\\u3067\\u3042\\u308B\\u3002";
static const int32_t jaWordTOffsets[] = {
       2, 3,          7, 8, 14,         17, 18,     20, 21, 24,         27, 28
};
static const int32_t jaWordROffsets[] = {
    1, 2, 3, 4, 5, 6, 7, 8, 14, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26, 27, 28
};

// Sentence breaks, locale "el":
//   a break is added after the Greek question mark (cldrbug #2069).
static const char    elSentText[]     = "\\u0391\\u03B2, \\u03B3\\u03B4; \\u0395 \\u03B6\\u03B7\\u037E \\u0398 \\u03B9\\u03BA. "
                                        "\\u039B\\u03BC \\u03BD\\u03BE! \\u039F\\u03C0, \\u03A1\\u03C2? \\u03A3";
static const int32_t elSentTOffsets[] = {
    8, 14, 20, 27, 35, 36
};
static const int32_t elSentROffsets[] = {
           20, 27, 35, 36
};

// Character (grapheme cluster) breaks, locale "th":
//   clusters should not include spacing Thai/Lao vowels (prefix or postfix),
//   except for [SARA] AM (cldrbug #2161).
static const char    thCharText[]     = "\\u0E01\\u0E23\\u0E30\\u0E17\\u0E48\\u0E2D\\u0E21\\u0E23\\u0E08\\u0E19\\u0E32 "
                                        "(\\u0E2A\\u0E38\\u0E0A\\u0E32\\u0E15\\u0E34-\\u0E08\\u0E38\\u0E11\\u0E32\\u0E21\\u0E32\\u0E28) "
                                        "\\u0E40\\u0E14\\u0E47\\u0E01\\u0E21\\u0E35\\u0E1B\\u0E31\\u0E0D\\u0E2B\\u0E32 ";
static const int32_t thCharTOffsets[] = {
    1, 2, 3, 5, 6, 7, 8, 9, 10, 11,
    12, 13, 15, 16, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28,
    29, 30, 32, 33, 35, 37, 38, 39, 40, 41
};
static const int32_t thCharROffsets[] = {
    1,    3, 5, 6, 7, 8, 9,     11,
    12, 13, 15,     17, 19, 20, 22,     24,     26, 27, 28,
    29,     32, 33, 35, 37, 38,     40, 41
};
1894
1895 typedef struct {
1896     UBreakIteratorType  type;
1897     const char *        locale;
1898     const char *        escapedText;
1899     const int32_t *     tailoredOffsets;
1900     int32_t             tailoredOffsetsCount;
1901     const int32_t *     rootOffsets;
1902     int32_t             rootOffsetsCount;
1903 } TailoredBreakItem;
1904
1905 #define ARRAY_PTR_LEN(array) (array),(sizeof(array)/sizeof(array[0]))
1906
1907 static const TailoredBreakItem tbItems[] = {
1908     { UBRK_WORD,      "en_US_POSIX", posxWordText, ARRAY_PTR_LEN(posxWordTOffsets), ARRAY_PTR_LEN(posxWordROffsets) },
1909     { UBRK_WORD,      "ja",          jaWordText,   ARRAY_PTR_LEN(jaWordTOffsets),   ARRAY_PTR_LEN(jaWordROffsets)   },
1910     { UBRK_SENTENCE,  "el",          elSentText,   ARRAY_PTR_LEN(elSentTOffsets),   ARRAY_PTR_LEN(elSentROffsets)   },
1911     { UBRK_CHARACTER, "th",          thCharText,   ARRAY_PTR_LEN(thCharTOffsets),   ARRAY_PTR_LEN(thCharROffsets)   },
1912     { UBRK_CHARACTER, NULL,          NULL,         NULL,0,                          NULL,0                          } // terminator
1913 };
1914
// Formats a list of offsets as text, each one written as a space followed by
// the decimal value (for example " 5 6 10"), for use in error messages.
//
//   buffer   destination for the text; always NUL-terminated on return when
//            it is non-NULL and buflen > 0.
//   buflen   capacity of buffer in chars, including room for the terminating NUL.
//   count    number of entries in offsets; zero produces an empty string.
//   offsets  the values to format.
//
// Output is truncated at the last offset that fits completely; nothing is
// ever written beyond buflen chars.
static void formatOffsets(char* buffer, int32_t buflen, int32_t count, const int32_t* offsets) {
    if (buffer == NULL || buflen <= 0) {
        return;
    }
    buffer[0] = 0;      // so that count == 0 still yields a valid (empty) string
    while (count-- > 0) {
        // A space plus the longest 32-bit value ("-2147483648") is 12 chars,
        //   so this fixed scratch buffer cannot overflow.
        char item[16];
        int  itemLen = sprintf(item, " %d", (int)*offsets++);
        if (itemLen < 0 || itemLen >= buflen) {
            // Formatting failed, or no room left for this item plus the NUL.
            break;
        }
        memcpy(buffer, item, (size_t)itemLen + 1);   // copies the NUL too
        buffer += itemLen;
        buflen -= itemLen;
    }
}
1923
1924 enum { kMaxOffsetCount = 128 };
1925
1926 void RBBITest::TBTest(BreakIterator* brkitr, int type, const char *locale, const char* escapedText, const int32_t *expectOffsets, int32_t expectOffsetsCount) {
1927     brkitr->setText( CharsToUnicodeString(escapedText) );
1928     int32_t foundOffsets[kMaxOffsetCount];
1929     int32_t offset, foundOffsetsCount = 0;
1930     // do forwards iteration test
1931     while ( foundOffsetsCount < kMaxOffsetCount && (offset = brkitr->next()) != BreakIterator::DONE ) {
1932         foundOffsets[foundOffsetsCount++] = offset;
1933     }
1934     if ( foundOffsetsCount != expectOffsetsCount || memcmp(expectOffsets, foundOffsets, foundOffsetsCount*sizeof(foundOffsets[0])) != 0 ) {
1935         // log error for forwards test
1936         char formatExpect[512], formatFound[512];
1937         formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1938         formatOffsets(formatFound, sizeof(formatFound), foundOffsetsCount, foundOffsets);
1939         errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found %d offsets fwd:%s\n",
1940                 type, locale, escapedText, expectOffsetsCount, formatExpect, foundOffsetsCount, formatFound);
1941     } else {
1942         // do backwards iteration test
1943         --foundOffsetsCount; // back off one from the end offset
1944         while ( foundOffsetsCount > 0 ) {
1945             offset = brkitr->previous();
1946             if ( offset != foundOffsets[--foundOffsetsCount] ) {
1947                 // log error for backwards test
1948                 char formatExpect[512];
1949                 formatOffsets(formatExpect, sizeof(formatExpect), expectOffsetsCount, expectOffsets);
1950                 errln("For type %d %-5s, text \"%.16s\"...; expect %d offsets:%s; found rev offset %d where expect %d\n",
1951                         type, locale, escapedText, expectOffsetsCount, formatExpect, offset, foundOffsets[foundOffsetsCount]);
1952                 break;
1953             }
1954         }
1955     }
1956 }
1957
1958 void RBBITest::TestTailoredBreaks() {
1959     const TailoredBreakItem * tbItemPtr;
1960     Locale rootLocale = Locale("root");
1961     for (tbItemPtr = tbItems; tbItemPtr->escapedText != NULL; ++tbItemPtr) {
1962         Locale testLocale = Locale(tbItemPtr->locale);
1963         BreakIterator * tailoredBrkiter = NULL;
1964         BreakIterator * rootBrkiter = NULL;
1965         UErrorCode status = U_ZERO_ERROR;
1966         switch (tbItemPtr->type) {
1967             case UBRK_CHARACTER:
1968                 tailoredBrkiter = BreakIterator::createCharacterInstance(testLocale, status);
1969                 rootBrkiter = BreakIterator::createCharacterInstance(rootLocale, status);
1970                 break;
1971             case UBRK_WORD:
1972                 tailoredBrkiter = BreakIterator::createWordInstance(testLocale, status);
1973                 rootBrkiter = BreakIterator::createWordInstance(rootLocale, status);
1974                 break;
1975             case UBRK_LINE:
1976                 tailoredBrkiter = BreakIterator::createLineInstance(testLocale, status);
1977                 rootBrkiter = BreakIterator::createLineInstance(rootLocale, status);
1978                 break;
1979             case UBRK_SENTENCE:
1980                 tailoredBrkiter = BreakIterator::createSentenceInstance(testLocale, status);
1981                 rootBrkiter = BreakIterator::createSentenceInstance(rootLocale, status);
1982                 break;
1983             default:
1984                 status = U_UNSUPPORTED_ERROR;
1985                 break;
1986         }
1987         if (U_FAILURE(status)) {
1988             errcheckln(status, "BreakIterator create failed for type %d, locales root or %s - Error: %s", (int)(tbItemPtr->type), tbItemPtr->locale, u_errorName(status));
1989             continue;
1990         }
1991         TBTest(tailoredBrkiter, (int)(tbItemPtr->type), tbItemPtr->locale, tbItemPtr->escapedText, tbItemPtr->tailoredOffsets, tbItemPtr->tailoredOffsetsCount);
1992         TBTest(rootBrkiter,     (int)(tbItemPtr->type), "root",            tbItemPtr->escapedText, tbItemPtr->rootOffsets,     tbItemPtr->rootOffsetsCount);
1993
1994         delete rootBrkiter;
1995         delete tailoredBrkiter;
1996     }
1997 }
1998
1999
2000 //-------------------------------------------------------------------------------
2001 //
2002 //  TestDictRules   create a break iterator from source rules that includes a
2003 //                  dictionary range.   Regression for bug #7130.  Source rules
2004 //                  do not declare a break iterator type (word, line, sentence, etc.
2005 //                  but the dictionary code, without a type, would loop.
2006 //
2007 //-------------------------------------------------------------------------------
2008 void RBBITest::TestDictRules() {
2009     const char *rules =  "$dictionary = [a-z]; \n"
2010                          "!!forward; \n"
2011                          "$dictionary $dictionary; \n"
2012                          "!!reverse; \n"
2013                          "$dictionary $dictionary; \n";
2014     const char *text = "aa";
2015     UErrorCode status = U_ZERO_ERROR;
2016     UParseError parseError;
2017
2018     RuleBasedBreakIterator bi(rules, parseError, status);
2019     if (U_SUCCESS(status)) {
2020         UnicodeString utext = text;
2021         bi.setText(utext);
2022         int32_t position;
2023         int32_t loops;
2024         for (loops = 0; loops<10; loops++) {
2025             position = bi.next();
2026             if (position == RuleBasedBreakIterator::DONE) {
2027                 break;
2028             }
2029         }
2030         TEST_ASSERT(loops == 1);
2031     } else {
2032         dataerrln("Error creating RuleBasedBreakIterator: %s", u_errorName(status));
2033     }
2034 }
2035
2036
2037
2038 //-------------------------------------------------------------------------------
2039 //
2040 //    ReadAndConvertFile   Read a text data file, convert it to UChars, and
2041 //    return the datain one big UChar * buffer, which the caller must delete.
2042 //
2043 //    parameters:
2044 //          fileName:   the name of the file, with no directory part.  The test data directory
2045 //                      is assumed.
2046 //          ulen        an out parameter, receives the actual length (in UChars) of the file data.
2047 //          encoding    The file encoding.  If the file contains a BOM, that will override the encoding
2048 //                      specified here.  The BOM, if it exists, will be stripped from the returned data.
2049 //                      Pass NULL for the system default encoding.
2050 //          status
2051 //    returns:
2052 //                      The file data, converted to UChar.
2053 //                      The caller must delete this when done with
2054 //                           delete [] theBuffer;
2055 //
2056 //    TODO:  This is a clone of RegexTest::ReadAndConvertFile.
2057 //           Move this function to some common place.
2058 //
2059 //--------------------------------------------------------------------------------
2060 UChar *RBBITest::ReadAndConvertFile(const char *fileName, int &ulen, const char *encoding, UErrorCode &status) {
2061     UChar       *retPtr  = NULL;
2062     char        *fileBuf = NULL;
2063     UConverter* conv     = NULL;
2064     FILE        *f       = NULL;
2065
2066     ulen = 0;
2067     if (U_FAILURE(status)) {
2068         return retPtr;
2069     }
2070
2071     //
2072     //  Open the file.
2073     //
2074     f = fopen(fileName, "rb");
2075     if (f == 0) {
2076         dataerrln("Error opening test data file %s\n", fileName);
2077         status = U_FILE_ACCESS_ERROR;
2078         return NULL;
2079     }
2080     //
2081     //  Read it in
2082     //
2083     int   fileSize;
2084     int   amt_read;
2085
2086     fseek( f, 0, SEEK_END);
2087     fileSize = ftell(f);
2088     fileBuf = new char[fileSize];
2089     fseek(f, 0, SEEK_SET);
2090     amt_read = fread(fileBuf, 1, fileSize, f);
2091     if (amt_read != fileSize || fileSize <= 0) {
2092         errln("Error reading test data file.");
2093         goto cleanUpAndReturn;
2094     }
2095
2096     //
2097     // Look for a Unicode Signature (BOM) on the data just read
2098     //
2099     int32_t        signatureLength;
2100     const char *   fileBufC;
2101     const char*    bomEncoding;
2102
2103     fileBufC = fileBuf;
2104     bomEncoding = ucnv_detectUnicodeSignature(
2105         fileBuf, fileSize, &signatureLength, &status);
2106     if(bomEncoding!=NULL ){
2107         fileBufC  += signatureLength;
2108         fileSize  -= signatureLength;
2109         encoding = bomEncoding;
2110     }
2111
2112     //
2113     // Open a converter to take the rule file to UTF-16
2114     //
2115     conv = ucnv_open(encoding, &status);
2116     if (U_FAILURE(status)) {
2117         goto cleanUpAndReturn;
2118     }
2119
2120     //
2121     // Convert the rules to UChar.
2122     //  Preflight first to determine required buffer size.
2123     //
2124     ulen = ucnv_toUChars(conv,
2125         NULL,           //  dest,
2126         0,              //  destCapacity,
2127         fileBufC,
2128         fileSize,
2129         &status);
2130     if (status == U_BUFFER_OVERFLOW_ERROR) {
2131         // Buffer Overflow is expected from the preflight operation.
2132         status = U_ZERO_ERROR;
2133
2134         retPtr = new UChar[ulen+1];
2135         ucnv_toUChars(conv,
2136             retPtr,       //  dest,
2137             ulen+1,
2138             fileBufC,
2139             fileSize,
2140             &status);
2141     }
2142
2143 cleanUpAndReturn:
2144     fclose(f);
2145     delete []fileBuf;
2146     ucnv_close(conv);
2147     if (U_FAILURE(status)) {
2148         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
2149         delete retPtr;
2150         retPtr = 0;
2151         ulen   = 0;
2152     };
2153     return retPtr;
2154 }
2155
2156
2157
2158 //--------------------------------------------------------------------------------------------
2159 //
2160 //   Run tests from each of the boundary test data files distributed by the Unicode Consortium
2161 //
2162 //-------------------------------------------------------------------------------------------
2163 void RBBITest::TestUnicodeFiles() {
2164     RuleBasedBreakIterator  *bi;
2165     UErrorCode               status = U_ZERO_ERROR;
2166
2167     bi =  (RuleBasedBreakIterator *)BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2168     TEST_ASSERT_SUCCESS(status);
2169     if (U_SUCCESS(status)) {
2170         runUnicodeTestData("GraphemeBreakTest.txt", bi);
2171     }
2172     delete bi;
2173
2174     bi =  (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), status);
2175     TEST_ASSERT_SUCCESS(status);
2176     if (U_SUCCESS(status)) {
2177         runUnicodeTestData("WordBreakTest.txt", bi);
2178     }
2179     delete bi;
2180
2181     bi =  (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getEnglish(), status);
2182     TEST_ASSERT_SUCCESS(status);
2183     if (U_SUCCESS(status)) {
2184         runUnicodeTestData("SentenceBreakTest.txt", bi);
2185     }
2186     delete bi;
2187
2188     bi =  (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getEnglish(), status);
2189     TEST_ASSERT_SUCCESS(status);
2190     if (U_SUCCESS(status)) {
2191         runUnicodeTestData("LineBreakTest.txt", bi);
2192     }
2193     delete bi;
2194 }
2195
2196
2197 //--------------------------------------------------------------------------------------------
2198 //
2199 //   Run tests from one of the boundary test data files distributed by the Unicode Consortium
2200 //
2201 //-------------------------------------------------------------------------------------------
2202 void RBBITest::runUnicodeTestData(const char *fileName, RuleBasedBreakIterator *bi) {
2203 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2204 // TODO(andy): Match line break behavior to Unicode 6.0 and remove this time bomb.
2205   UVersionInfo icu4700 = { 4, 7, 0, 0 };
2206 UBool isICUVersionPast46 = isICUVersionAtLeast(icu4700);
2207 UBool isLineBreak = 0 == strcmp(fileName, "LineBreakTest.txt");
2208     UErrorCode  status = U_ZERO_ERROR;
2209
2210     //
2211     //  Open and read the test data file, put it into a UnicodeString.
2212     //
2213     const char *testDataDirectory = IntlTest::getSourceTestData(status);
2214     char testFileName[1000];
2215     if (testDataDirectory == NULL || strlen(testDataDirectory) >= sizeof(testFileName)) {
2216         dataerrln("Can't open test data.  Path too long.");
2217         return;
2218     }
2219     strcpy(testFileName, testDataDirectory);
2220     strcat(testFileName, fileName);
2221
2222     logln("Opening data file %s\n", fileName);
2223
2224     int    len;
2225     UChar *testFile = ReadAndConvertFile(testFileName, len, "UTF-8", status);
2226     if (status != U_FILE_ACCESS_ERROR) {
2227         TEST_ASSERT_SUCCESS(status);
2228         TEST_ASSERT(testFile != NULL);
2229     }
2230     if (U_FAILURE(status) || testFile == NULL) {
2231         return; /* something went wrong, error already output */
2232     }
2233     UnicodeString testFileAsString(TRUE, testFile, len);
2234
2235     //
2236     //  Parse the test data file using a regular expression.
2237     //  Each kind of token is recognized in its own capture group; what type of item was scanned
2238     //     is identified by which group had a match.
2239     //
2240     //    Caputure Group #                  1          2            3            4           5
2241     //    Parses this item:               divide       x      hex digits   comment \n  unrecognized \n
2242     //
2243     UnicodeString tokenExpr("[ \t]*(?:(\\u00F7)|(\\u00D7)|([0-9a-fA-F]+)|((?:#.*?)?$.)|(.*?$.))", -1, US_INV);
2244     RegexMatcher    tokenMatcher(tokenExpr, testFileAsString, UREGEX_MULTILINE | UREGEX_DOTALL, status);
2245     UnicodeString   testString;
2246     UVector32       breakPositions(status);
2247     int             lineNumber = 1;
2248     TEST_ASSERT_SUCCESS(status);
2249     if (U_FAILURE(status)) {
2250         return;
2251     }
2252
2253     //
2254     //  Scan through each test case, building up the string to be broken in testString,
2255     //   and the positions that should be boundaries in the breakPositions vector.
2256     //
2257     int spin = 0;
2258     while (tokenMatcher.find()) {
2259         if(tokenMatcher.hitEnd()) {
2260           /* Shouldnt Happen(TM).  This means we didn't find the symbols we were looking for.
2261              This occurred when the text file was corrupt (wasn't marked as UTF-8)
2262              and caused an infinite loop here on EBCDIC systems!
2263           */
2264           fprintf(stderr,"FAIL: hit end of file %s for the %8dth time- corrupt data file?\r", fileName, ++spin);
2265           //       return;
2266         }
2267         if (tokenMatcher.start(1, status) >= 0) {
2268             // Scanned a divide sign, indicating a break position in the test data.
2269             if (testString.length()>0) {
2270                 breakPositions.addElement(testString.length(), status);
2271             }
2272         }
2273         else if (tokenMatcher.start(2, status) >= 0) {
2274             // Scanned an 'x', meaning no break at this position in the test data
2275             //   Nothing to be done here.
2276             }
2277         else if (tokenMatcher.start(3, status) >= 0) {
2278             // Scanned Hex digits.  Convert them to binary, append to the character data string.
2279             const UnicodeString &hexNumber = tokenMatcher.group(3, status);
2280             int length = hexNumber.length();
2281             if (length<=8) {
2282                 char buf[10];
2283                 hexNumber.extract (0, length, buf, sizeof(buf), US_INV);
2284                 UChar32 c = (UChar32)strtol(buf, NULL, 16);
2285                 if (c<=0x10ffff) {
2286                     testString.append(c);
2287                 } else {
2288                     errln("Error: Unicode Character value out of range. \'%s\', line %d.\n",
2289                        fileName, lineNumber);
2290                 }
2291             } else {
2292                 errln("Syntax Error: Hex Unicode Character value must have no more than 8 digits at \'%s\', line %d.\n",
2293                        fileName, lineNumber);
2294              }
2295         }
2296         else if (tokenMatcher.start(4, status) >= 0) {
2297             // Scanned to end of a line, possibly skipping over a comment in the process.
2298             //   If the line from the file contained test data, run the test now.
2299             //
2300             if (testString.length() > 0) {
2301 // TODO(andy): Remove this time bomb code.
2302 if (!isLineBreak || isICUVersionPast46 || !(4658 <= lineNumber && lineNumber <= 4758)) {
2303                 checkUnicodeTestCase(fileName, lineNumber, testString, &breakPositions, bi);
2304 }
2305             }
2306
2307             // Clear out this test case.
2308             //    The string and breakPositions vector will be refilled as the next
2309             //       test case is parsed.
2310             testString.remove();
2311             breakPositions.removeAllElements();
2312             lineNumber++;
2313         } else {
2314             // Scanner catchall.  Something unrecognized appeared on the line.
2315             char token[16];
2316             UnicodeString uToken = tokenMatcher.group(0, status);
2317             uToken.extract(0, uToken.length(), token, (uint32_t)sizeof(token));
2318             token[sizeof(token)-1] = 0;
2319             errln("Syntax error in test data file \'%s\', line %d.  Scanning \"%s\"\n", fileName, lineNumber, token);
2320
2321             // Clean up, in preparation for continuing with the next line.
2322             testString.remove();
2323             breakPositions.removeAllElements();
2324             lineNumber++;
2325         }
2326         TEST_ASSERT_SUCCESS(status);
2327         if (U_FAILURE(status)) {
2328             break;
2329         }
2330     }
2331
2332     delete [] testFile;
2333  #endif   // !UCONFIG_NO_REGULAR_EXPRESSIONS
2334 }
2335
2336 //--------------------------------------------------------------------------------------------
2337 //
2338 //   checkUnicodeTestCase()   Run one test case from one of the Unicode Consortium
2339 //                            test data files.  Do only a simple, forward-only check -
2340 //                            this test is mostly to check that ICU and the Unicode
2341 //                            data agree with each other.
2342 //
2343 //--------------------------------------------------------------------------------------------
2344 void RBBITest::checkUnicodeTestCase(const char *testFileName, int lineNumber,
2345                          const UnicodeString &testString,   // Text data to be broken
2346                          UVector32 *breakPositions,         // Positions where breaks should be found.
2347                          RuleBasedBreakIterator *bi) {
2348     int32_t pos;                 // Break Position in the test string
2349     int32_t expectedI = 0;       // Index of expected break position in the vector of expected results.
2350     int32_t expectedPos;         // Expected break position (index into test string)
2351
2352     bi->setText(testString);
2353     pos = bi->first();
2354     pos = bi->next();
2355
2356     while (pos != BreakIterator::DONE) {
2357         if (expectedI >= breakPositions->size()) {
2358             errln("Test file \"%s\", line %d, unexpected break found at position %d",
2359                 testFileName, lineNumber, pos);
2360             break;
2361         }
2362         expectedPos = breakPositions->elementAti(expectedI);
2363         if (pos < expectedPos) {
2364             errln("Test file \"%s\", line %d, unexpected break found at position %d",
2365                 testFileName, lineNumber, pos);
2366             break;
2367         }
2368         if (pos > expectedPos) {
2369             errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2370                 testFileName, lineNumber, expectedPos);
2371             break;
2372         }
2373         pos = bi->next();
2374         expectedI++;
2375     }
2376
2377     if (pos==BreakIterator::DONE && expectedI<breakPositions->size()) {
2378         errln("Test file \"%s\", line %d, failed to find expected break at position %d",
2379             testFileName, lineNumber, breakPositions->elementAti(expectedI));
2380     }
2381 }
2382
2383
2384
2385 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
2386 //---------------------------------------------------------------------------------------
2387 //
2388 //   classs RBBIMonkeyKind
2389 //
2390 //      Monkey Test for Break Iteration
2391 //      Abstract interface class.   Concrete derived classes independently
2392 //      implement the break rules for different iterator types.
2393 //
2394 //      The Monkey Test itself uses doesn't know which type of break iterator it is
2395 //      testing, but works purely in terms of the interface defined here.
2396 //
2397 //---------------------------------------------------------------------------------------
2398 class RBBIMonkeyKind {
2399 public:
2400     // Return a UVector of UnicodeSets, representing the character classes used
2401     //   for this type of iterator.
2402     virtual  UVector  *charClasses() = 0;
2403
2404     // Set the test text on which subsequent calls to next() will operate
2405     virtual  void      setText(const UnicodeString &s) = 0;
2406
2407     // Find the next break postion, starting from the prev break position, or from zero.
2408     // Return -1 after reaching end of string.
2409     virtual  int32_t   next(int32_t i) = 0;
2410
2411     virtual ~RBBIMonkeyKind();
2412     UErrorCode       deferredStatus;
2413
2414
2415 protected:
2416     RBBIMonkeyKind();
2417
2418 private:
2419 };
2420
2421 RBBIMonkeyKind::RBBIMonkeyKind() {
2422     deferredStatus = U_ZERO_ERROR;
2423 }
2424
2425 RBBIMonkeyKind::~RBBIMonkeyKind() {
2426 }
2427
2428
//----------------------------------------------------------------------------------------
//
//   Random Numbers.  Similar to standard lib rand() and srand().
//                    The library functions are not used, in order to
//                      1.  Get the same results on all platforms.
//                      2.  Have access to the current seed, which makes it easier
//                          to reproduce failures.
//
//---------------------------------------------------------------------------------------
static uint32_t m_seed = 1;

// Advance the generator by one step and return a value in the range 0..32767.
static uint32_t m_rand()
{
    // Linear congruential step.  uint32_t arithmetic wraps modulo 2^32.
    m_seed *= 1103515245;
    m_seed += 12345;

    // Bits 16..30 of the new seed, i.e. (m_seed / 65536) % 32768.
    return (m_seed >> 16) & 0x7fff;
}
2444
2445
2446 //------------------------------------------------------------------------------------------
2447 //
2448 //   class RBBICharMonkey      Character (Grapheme Cluster) specific implementation
2449 //                             of RBBIMonkeyKind.
2450 //
2451 //------------------------------------------------------------------------------------------
2452 class RBBICharMonkey: public RBBIMonkeyKind {
2453 public:
2454     RBBICharMonkey();
2455     virtual          ~RBBICharMonkey();
2456     virtual  UVector *charClasses();
2457     virtual  void     setText(const UnicodeString &s);
2458     virtual  int32_t  next(int32_t i);
2459 private:
2460     UVector   *fSets;
2461
2462     UnicodeSet  *fCRLFSet;
2463     UnicodeSet  *fControlSet;
2464     UnicodeSet  *fExtendSet;
2465     UnicodeSet  *fPrependSet;
2466     UnicodeSet  *fSpacingSet;
2467     UnicodeSet  *fLSet;
2468     UnicodeSet  *fVSet;
2469     UnicodeSet  *fTSet;
2470     UnicodeSet  *fLVSet;
2471     UnicodeSet  *fLVTSet;
2472     UnicodeSet  *fHangulSet;
2473     UnicodeSet  *fAnySet;
2474
2475     const UnicodeString *fText;
2476 };
2477
2478
2479 RBBICharMonkey::RBBICharMonkey() {
2480     UErrorCode  status = U_ZERO_ERROR;
2481
2482     fText = NULL;
2483
2484     fCRLFSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\r\\n]"), status);
2485     fControlSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Control}]"), status);
2486     fExtendSet  = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Extend}]"), status);
2487     fPrependSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = Prepend}]"), status);
2488     fSpacingSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = SpacingMark}]"), status);
2489     fLSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = L}]"), status);
2490     fVSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = V}]"), status);
2491     fTSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = T}]"), status);
2492     fLVSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LV}]"), status);
2493     fLVTSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Grapheme_Cluster_Break = LVT}]"), status);
2494     fHangulSet  = new UnicodeSet();
2495     fHangulSet->addAll(*fLSet);
2496     fHangulSet->addAll(*fVSet);
2497     fHangulSet->addAll(*fTSet);
2498     fHangulSet->addAll(*fLVSet);
2499     fHangulSet->addAll(*fLVTSet);
2500     fAnySet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\u0000-\\U0010ffff]"), status);
2501
2502     fSets       = new UVector(status);
2503     fSets->addElement(fCRLFSet,    status);
2504     fSets->addElement(fControlSet, status);
2505     fSets->addElement(fExtendSet,  status);
2506     fSets->addElement(fPrependSet, status);
2507     fSets->addElement(fSpacingSet, status);
2508     fSets->addElement(fHangulSet,  status);
2509     fSets->addElement(fAnySet,     status);
2510     if (U_FAILURE(status)) {
2511         deferredStatus = status;
2512     }
2513 }
2514
2515
2516 void RBBICharMonkey::setText(const UnicodeString &s) {
2517     fText = &s;
2518 }
2519
2520
2521
2522 int32_t RBBICharMonkey::next(int32_t prevPos) {
2523     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2524                               //   break position being tested.  The candidate break
2525                               //   location is before p2.
2526
2527     int     breakPos = -1;
2528
2529     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2530
2531     if (U_FAILURE(deferredStatus)) {
2532         return -1;
2533     }
2534
2535     // Previous break at end of string.  return DONE.
2536     if (prevPos >= fText->length()) {
2537         return -1;
2538     }
2539     p0 = p1 = p2 = p3 = prevPos;
2540     c3 =  fText->char32At(prevPos);
2541     c0 = c1 = c2 = 0;
2542
2543     // Loop runs once per "significant" character position in the input text.
2544     for (;;) {
2545         // Move all of the positions forward in the input string.
2546         p0 = p1;  c0 = c1;
2547         p1 = p2;  c1 = c2;
2548         p2 = p3;  c2 = c3;
2549
2550         // Advancd p3 by one codepoint
2551         p3 = fText->moveIndex32(p3, 1);
2552         c3 = fText->char32At(p3);
2553
2554         if (p1 == p2) {
2555             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2556             continue;
2557         }
2558         if (p2 == fText->length()) {
2559             // Reached end of string.  Always a break position.
2560             break;
2561         }
2562
2563         // Rule  GB3   CR x LF
2564         //     No Extend or Format characters may appear between the CR and LF,
2565         //     which requires the additional check for p2 immediately following p1.
2566         //
2567         if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
2568             continue;
2569         }
2570
2571         // Rule (GB4).   ( Control | CR | LF ) <break>
2572         if (fControlSet->contains(c1) ||
2573             c1 == 0x0D ||
2574             c1 == 0x0A)  {
2575             break;
2576         }
2577
2578         // Rule (GB5)    <break>  ( Control | CR | LF )
2579         //
2580         if (fControlSet->contains(c2) ||
2581             c2 == 0x0D ||
2582             c2 == 0x0A)  {
2583             break;
2584         }
2585
2586
2587         // Rule (GB6)  L x ( L | V | LV | LVT )
2588         if (fLSet->contains(c1) &&
2589                (fLSet->contains(c2)  ||
2590                 fVSet->contains(c2)  ||
2591                 fLVSet->contains(c2) ||
2592                 fLVTSet->contains(c2))) {
2593             continue;
2594         }
2595
2596         // Rule (GB7)    ( LV | V )  x  ( V | T )
2597         if ((fLVSet->contains(c1) || fVSet->contains(c1)) &&
2598             (fVSet->contains(c2) || fTSet->contains(c2)))  {
2599             continue;
2600         }
2601
2602         // Rule (GB8)    ( LVT | T)  x T
2603         if ((fLVTSet->contains(c1) || fTSet->contains(c1)) &&
2604             fTSet->contains(c2))  {
2605             continue;
2606         }
2607
2608         // Rule (GB9)    Numeric x ALetter
2609         if (fExtendSet->contains(c2))  {
2610             continue;
2611         }
2612
2613         // Rule (GB9a)   x  SpacingMark
2614         if (fSpacingSet->contains(c2)) {
2615             continue;
2616         }
2617
2618         // Rule (GB9b)   Prepend x
2619         if (fPrependSet->contains(c1)) {
2620             continue;
2621         }
2622
2623         // Rule (GB10)  Any  <break>  Any
2624         break;
2625     }
2626
2627     breakPos = p2;
2628     return breakPos;
2629 }
2630
2631
2632
2633 UVector  *RBBICharMonkey::charClasses() {
2634     return fSets;
2635 }
2636
2637
2638 RBBICharMonkey::~RBBICharMonkey() {
2639     delete fSets;
2640     delete fCRLFSet;
2641     delete fControlSet;
2642     delete fExtendSet;
2643     delete fPrependSet;
2644     delete fSpacingSet;
2645     delete fLSet;
2646     delete fVSet;
2647     delete fTSet;
2648     delete fLVSet;
2649     delete fLVTSet;
2650     delete fHangulSet;
2651     delete fAnySet;
2652 }
2653
2654 //------------------------------------------------------------------------------------------
2655 //
2656 //   class RBBIWordMonkey      Word Break specific implementation
2657 //                             of RBBIMonkeyKind.
2658 //
2659 //------------------------------------------------------------------------------------------
2660 class RBBIWordMonkey: public RBBIMonkeyKind {
2661 public:
2662     RBBIWordMonkey();
2663     virtual          ~RBBIWordMonkey();
2664     virtual  UVector *charClasses();
2665     virtual  void     setText(const UnicodeString &s);
2666     virtual int32_t   next(int32_t i);
2667 private:
2668     UVector      *fSets;
2669
2670     UnicodeSet  *fCRSet;
2671     UnicodeSet  *fLFSet;
2672     UnicodeSet  *fNewlineSet;
2673     UnicodeSet  *fKatakanaSet;
2674     UnicodeSet  *fALetterSet;
2675     UnicodeSet  *fMidNumLetSet;
2676     UnicodeSet  *fMidLetterSet;
2677     UnicodeSet  *fMidNumSet;
2678     UnicodeSet  *fNumericSet;
2679     UnicodeSet  *fFormatSet;
2680     UnicodeSet  *fOtherSet;
2681     UnicodeSet  *fExtendSet;
2682     UnicodeSet  *fExtendNumLetSet;
2683
2684     RegexMatcher  *fMatcher;
2685
2686     const UnicodeString  *fText;
2687 };
2688
2689
2690 RBBIWordMonkey::RBBIWordMonkey()
2691 {
2692     UErrorCode  status = U_ZERO_ERROR;
2693
2694     fSets            = new UVector(status);
2695
2696     fCRSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = CR}]"),           status);
2697     fLFSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = LF}]"),           status);
2698     fNewlineSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Newline}]"),      status);
2699     fALetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ALetter}]"),      status);
2700     fKatakanaSet     = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Katakana}]"),     status);
2701     fMidNumLetSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNumLet}]"),    status);
2702     fMidLetterSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidLetter}]"),    status);
2703     fMidNumSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = MidNum}]"),       status);
2704     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Numeric}]"),      status);
2705     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Format}]"),       status);
2706     fExtendNumLetSet = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = ExtendNumLet}]"), status);
2707     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Word_Break = Extend}]"),       status);
2708
2709     fOtherSet        = new UnicodeSet();
2710     if(U_FAILURE(status)) {
2711       deferredStatus = status;
2712       return;
2713     }
2714
2715     fOtherSet->complement();
2716     fOtherSet->removeAll(*fCRSet);
2717     fOtherSet->removeAll(*fLFSet);
2718     fOtherSet->removeAll(*fNewlineSet);
2719     fOtherSet->removeAll(*fKatakanaSet);
2720     fOtherSet->removeAll(*fALetterSet);
2721     fOtherSet->removeAll(*fMidLetterSet);
2722     fOtherSet->removeAll(*fMidNumSet);
2723     fOtherSet->removeAll(*fNumericSet);
2724     fOtherSet->removeAll(*fExtendNumLetSet);
2725     fOtherSet->removeAll(*fFormatSet);
2726     fOtherSet->removeAll(*fExtendSet);
2727     // Inhibit dictionary characters from being tested at all.
2728     fOtherSet->removeAll(UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{LineBreak = Complex_Context}]"), status));
2729
2730     fSets->addElement(fCRSet,        status);
2731     fSets->addElement(fLFSet,        status);
2732     fSets->addElement(fNewlineSet,   status);
2733     fSets->addElement(fALetterSet,   status);
2734     fSets->addElement(fKatakanaSet,  status);
2735     fSets->addElement(fMidLetterSet, status);
2736     fSets->addElement(fMidNumLetSet, status);
2737     fSets->addElement(fMidNumSet,    status);
2738     fSets->addElement(fNumericSet,   status);
2739     fSets->addElement(fFormatSet,    status);
2740     fSets->addElement(fExtendSet,    status);
2741     fSets->addElement(fOtherSet,     status);
2742     fSets->addElement(fExtendNumLetSet, status);
2743
2744     if (U_FAILURE(status)) {
2745         deferredStatus = status;
2746     }
2747 }
2748
2749 void RBBIWordMonkey::setText(const UnicodeString &s) {
2750     fText       = &s;
2751 }
2752
2753
2754 int32_t RBBIWordMonkey::next(int32_t prevPos) {
2755     int    p0, p1, p2, p3;    // Indices of the significant code points around the
2756                               //   break position being tested.  The candidate break
2757                               //   location is before p2.
2758
2759     int     breakPos = -1;
2760
2761     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
2762
2763     if (U_FAILURE(deferredStatus)) {
2764         return -1;
2765     }
2766
2767     // Prev break at end of string.  return DONE.
2768     if (prevPos >= fText->length()) {
2769         return -1;
2770     }
2771     p0 = p1 = p2 = p3 = prevPos;
2772     c3 =  fText->char32At(prevPos);
2773     c0 = c1 = c2 = 0;
2774
2775     // Loop runs once per "significant" character position in the input text.
2776     for (;;) {
2777         // Move all of the positions forward in the input string.
2778         p0 = p1;  c0 = c1;
2779         p1 = p2;  c1 = c2;
2780         p2 = p3;  c2 = c3;
2781
2782         // Advancd p3 by    X(Extend | Format)*   Rule 4
2783         //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
2784         do {
2785             p3 = fText->moveIndex32(p3, 1);
2786             c3 = fText->char32At(p3);
2787             if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2788                break;
2789             };
2790         }
2791         while (fFormatSet->contains(c3) || fExtendSet->contains(c3));
2792
2793
2794         if (p1 == p2) {
2795             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
2796             continue;
2797         }
2798         if (p2 == fText->length()) {
2799             // Reached end of string.  Always a break position.
2800             break;
2801         }
2802
2803         // Rule  (3)   CR x LF
2804         //     No Extend or Format characters may appear between the CR and LF,
2805         //     which requires the additional check for p2 immediately following p1.
2806         //
2807         if (c1==0x0D && c2==0x0A) {
2808             continue;
2809         }
2810
2811         // Rule (3a)  Break before and after newlines (including CR and LF)
2812         //
2813         if (fCRSet->contains(c1) || fLFSet->contains(c1) || fNewlineSet->contains(c1)) {
2814             break;
2815         };
2816         if (fCRSet->contains(c2) || fLFSet->contains(c2) || fNewlineSet->contains(c2)) {
2817             break;
2818         };
2819
2820         // Rule (5).   ALetter x ALetter
2821         if (fALetterSet->contains(c1) &&
2822             fALetterSet->contains(c2))  {
2823             continue;
2824         }
2825
2826         // Rule (6)  ALetter  x  (MidLetter | MidNumLet) ALetter
2827         //
2828         if ( fALetterSet->contains(c1)   &&
2829              (fMidLetterSet->contains(c2) || fMidNumLetSet->contains(c2)) &&
2830              fALetterSet->contains(c3)) {
2831             continue;
2832         }
2833
2834
2835         // Rule (7)  ALetter (MidLetter | MidNumLet)  x  ALetter
2836         if (fALetterSet->contains(c0) &&
2837             (fMidLetterSet->contains(c1) ||  fMidNumLetSet->contains(c1)) &&
2838             fALetterSet->contains(c2)) {
2839             continue;
2840         }
2841
2842         // Rule (8)    Numeric x Numeric
2843         if (fNumericSet->contains(c1) &&
2844             fNumericSet->contains(c2))  {
2845             continue;
2846         }
2847
2848         // Rule (9)    ALetter x Numeric
2849         if (fALetterSet->contains(c1) &&
2850             fNumericSet->contains(c2))  {
2851             continue;
2852         }
2853
2854         // Rule (10)    Numeric x ALetter
2855         if (fNumericSet->contains(c1) &&
2856             fALetterSet->contains(c2))  {
2857             continue;
2858         }
2859
2860         // Rule (11)   Numeric (MidNum | MidNumLet)  x  Numeric
2861         if (fNumericSet->contains(c0) &&
2862             (fMidNumSet->contains(c1) || fMidNumLetSet->contains(c1))  &&
2863             fNumericSet->contains(c2)) {
2864             continue;
2865         }
2866
2867         // Rule (12)  Numeric x (MidNum | MidNumLet) Numeric
2868         if (fNumericSet->contains(c1) &&
2869             (fMidNumSet->contains(c2) || fMidNumLetSet->contains(c2))  &&
2870             fNumericSet->contains(c3)) {
2871             continue;
2872         }
2873
2874         // Rule (13)  Katakana x Katakana
2875         if (fKatakanaSet->contains(c1) &&
2876             fKatakanaSet->contains(c2))  {
2877             continue;
2878         }
2879
2880         // Rule 13a
2881         if ((fALetterSet->contains(c1) || fNumericSet->contains(c1) ||
2882              fKatakanaSet->contains(c1) || fExtendNumLetSet->contains(c1)) &&
2883              fExtendNumLetSet->contains(c2)) {
2884                 continue;
2885              }
2886
2887         // Rule 13b
2888         if (fExtendNumLetSet->contains(c1) &&
2889                 (fALetterSet->contains(c2) || fNumericSet->contains(c2) ||
2890                 fKatakanaSet->contains(c2)))  {
2891                 continue;
2892              }
2893
2894         // Rule 14.  Break found here.
2895         break;
2896     }
2897
2898     breakPos = p2;
2899     return breakPos;
2900 }
2901
2902
2903 UVector  *RBBIWordMonkey::charClasses() {
2904     return fSets;
2905 }
2906
2907
2908 RBBIWordMonkey::~RBBIWordMonkey() {
2909     delete fSets;
2910     delete fCRSet;
2911     delete fLFSet;
2912     delete fNewlineSet;
2913     delete fKatakanaSet;
2914     delete fALetterSet;
2915     delete fMidNumLetSet;
2916     delete fMidLetterSet;
2917     delete fMidNumSet;
2918     delete fNumericSet;
2919     delete fFormatSet;
2920     delete fExtendSet;
2921     delete fExtendNumLetSet;
2922     delete fOtherSet;
2923 }
2924
2925
2926
2927
2928 //------------------------------------------------------------------------------------------
2929 //
2930 //   class RBBISentMonkey      Sentence Break specific implementation
2931 //                             of RBBIMonkeyKind.
2932 //
2933 //------------------------------------------------------------------------------------------
2934 class RBBISentMonkey: public RBBIMonkeyKind {
2935 public:
2936     RBBISentMonkey();
2937     virtual          ~RBBISentMonkey();
2938     virtual  UVector *charClasses();
2939     virtual  void     setText(const UnicodeString &s);
2940     virtual int32_t   next(int32_t i);
2941 private:
2942     int               moveBack(int posFrom);
2943     int               moveForward(int posFrom);
2944     UChar32           cAt(int pos);
2945
2946     UVector      *fSets;
2947
2948     UnicodeSet  *fSepSet;
2949     UnicodeSet  *fFormatSet;
2950     UnicodeSet  *fSpSet;
2951     UnicodeSet  *fLowerSet;
2952     UnicodeSet  *fUpperSet;
2953     UnicodeSet  *fOLetterSet;
2954     UnicodeSet  *fNumericSet;
2955     UnicodeSet  *fATermSet;
2956     UnicodeSet  *fSContinueSet;
2957     UnicodeSet  *fSTermSet;
2958     UnicodeSet  *fCloseSet;
2959     UnicodeSet  *fOtherSet;
2960     UnicodeSet  *fExtendSet;
2961
2962     const UnicodeString  *fText;
2963
2964 };
2965
2966 RBBISentMonkey::RBBISentMonkey()
2967 {
2968     UErrorCode  status = U_ZERO_ERROR;
2969
2970     fSets            = new UVector(status);
2971
2972     //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
2973     //                       set and made into character classes of their own.  For the monkey impl,
2974     //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
2975     fSepSet          = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"),     status);
2976     fFormatSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Format}]"),    status);
2977     fSpSet           = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Sp}]"),        status);
2978     fLowerSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Lower}]"),     status);
2979     fUpperSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Upper}]"),     status);
2980     fOLetterSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = OLetter}]"),   status);
2981     fNumericSet      = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Numeric}]"),   status);
2982     fATermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = ATerm}]"),     status);
2983     fSContinueSet    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = SContinue}]"), status);
2984     fSTermSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = STerm}]"),     status);
2985     fCloseSet        = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Close}]"),     status);
2986     fExtendSet       = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Sentence_Break = Extend}]"),    status);
2987     fOtherSet        = new UnicodeSet();
2988
2989     if(U_FAILURE(status)) {
2990       deferredStatus = status;
2991       return;
2992     }
2993
2994     fOtherSet->complement();
2995     fOtherSet->removeAll(*fSepSet);
2996     fOtherSet->removeAll(*fFormatSet);
2997     fOtherSet->removeAll(*fSpSet);
2998     fOtherSet->removeAll(*fLowerSet);
2999     fOtherSet->removeAll(*fUpperSet);
3000     fOtherSet->removeAll(*fOLetterSet);
3001     fOtherSet->removeAll(*fNumericSet);
3002     fOtherSet->removeAll(*fATermSet);
3003     fOtherSet->removeAll(*fSContinueSet);
3004     fOtherSet->removeAll(*fSTermSet);
3005     fOtherSet->removeAll(*fCloseSet);
3006     fOtherSet->removeAll(*fExtendSet);
3007
3008     fSets->addElement(fSepSet,       status);
3009     fSets->addElement(fFormatSet,    status);
3010     fSets->addElement(fSpSet,        status);
3011     fSets->addElement(fLowerSet,     status);
3012     fSets->addElement(fUpperSet,     status);
3013     fSets->addElement(fOLetterSet,   status);
3014     fSets->addElement(fNumericSet,   status);
3015     fSets->addElement(fATermSet,     status);
3016     fSets->addElement(fSContinueSet, status);
3017     fSets->addElement(fSTermSet,     status);
3018     fSets->addElement(fCloseSet,     status);
3019     fSets->addElement(fOtherSet,     status);
3020     fSets->addElement(fExtendSet,    status);
3021
3022     if (U_FAILURE(status)) {
3023         deferredStatus = status;
3024     }
3025 }
3026
3027
3028
3029 void RBBISentMonkey::setText(const UnicodeString &s) {
3030     fText       = &s;
3031 }
3032
3033 UVector  *RBBISentMonkey::charClasses() {
3034     return fSets;
3035 }
3036
3037
3038 //  moveBack()   Find the "significant" code point preceding the index i.
3039 //               Skips over ($Extend | $Format)* .
3040 //
3041 int RBBISentMonkey::moveBack(int i) {
3042     if (i <= 0) {
3043         return -1;
3044     }
3045     UChar32   c;
3046     int32_t   j = i;
3047     do {
3048         j = fText->moveIndex32(j, -1);
3049         c = fText->char32At(j);
3050     }
3051     while (j>0 &&(fFormatSet->contains(c) || fExtendSet->contains(c)));
3052     return j;
3053
3054  }
3055
3056
3057 int RBBISentMonkey::moveForward(int i) {
3058     if (i>=fText->length()) {
3059         return fText->length();
3060     }
3061     UChar32   c;
3062     int32_t   j = i;
3063     do {
3064         j = fText->moveIndex32(j, 1);
3065         c = cAt(j);
3066     }
3067     while (fFormatSet->contains(c) || fExtendSet->contains(c));
3068     return j;
3069 }
3070
3071 UChar32 RBBISentMonkey::cAt(int pos) {
3072     if (pos<0 || pos>=fText->length()) {
3073         return -1;
3074     } else {
3075         return fText->char32At(pos);
3076     }
3077 }
3078
3079 int32_t RBBISentMonkey::next(int32_t prevPos) {
         // Reference implementation of sentence boundary detection, used to check
         // the results of the rule based break iterator.
         //
         // Starting at the boundary prevPos, walks forward through fText one
         // "significant" character at a time, applying the numbered rules below in
         // order.  A rule that forbids a break does "continue"; a rule that
         // requires one does "break", and the boundary index (p2) is returned.
         //
         // Returns -1 if construction of this object failed (deferredStatus) or if
         // prevPos is already at or past the end of the text.
3080     int    p0, p1, p2, p3;    // Indices of the significant code points around the
3081                               //   break position being tested.  The candidate break
3082                               //   location is before p2.
3083
3084     int     breakPos = -1;
3085
3086     UChar32 c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
3087     UChar32 c;
3088
3089     if (U_FAILURE(deferredStatus)) {
3090         return -1;
3091     }
3092
3093     // Prev break at end of string.  return DONE.
3094     if (prevPos >= fText->length()) {
3095         return -1;
3096     }
         // All four positions start at prevPos; only c3 holds a real character.
         // c0..c2 are placeholders (0) until real characters shift in.
3097     p0 = p1 = p2 = p3 = prevPos;
3098     c3 =  fText->char32At(prevPos);
3099     c0 = c1 = c2 = 0;
3100
3101     // Loop runs once per "significant" character position in the input text.
3102     for (;;) {
3103         // Move all of the positions forward in the input string.
3104         p0 = p1;  c0 = c1;
3105         p1 = p2;  c1 = c2;
3106         p2 = p3;  c2 = c3;
3107
3108         // Advance p3 by    X(Extend | Format)*   Rule 4
3109         p3 = moveForward(p3);
3110         c3 = cAt(p3);
3111
3112         // Rule (3)  CR x LF
             //           The p2==(p1+1) test requires the LF to follow the CR directly,
             //           with no skipped Extend/Format characters in between.
3113         if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
3114             continue;
3115         }
3116
3117         // Rule (4).   Sep  <break>
3118         if (fSepSet->contains(c1)) {
3119             p2 = p1+1;   // Separators don't combine with Extend or Format.
3120             break;
3121         }
3122
3123         if (p2 >= fText->length()) {
3124             // Reached end of string.  Always a break position.
3125             break;
3126         }
3127
3128         if (p2 == prevPos) {
3129             // Still warming up the loop.  (won't work with zero length strings, but we don't care)
3130             continue;
3131         }
3132
3133         // Rule (6).   ATerm x Numeric
3134         if (fATermSet->contains(c1) &&  fNumericSet->contains(c2))  {
3135             continue;
3136         }
3137
3138         // Rule (7).  Upper ATerm  x  Upper
3139         if (fUpperSet->contains(c0) && fATermSet->contains(c1) && fUpperSet->contains(c2)) {
3140             continue;
3141         }
3142
3143         // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep | STerm | ATerm))* Lower
3144         //           Note:  STerm | ATerm are added to the negated part of the expression by a
3145         //                  note to the Unicode 5.0 documents.
             //           First scan backwards from p1 over Sp* then Close*.  At the start
             //           of the text moveBack() returns -1 and cAt(-1) returns -1, which
             //           ends these scans (assuming -1 is in none of the sets).
3146         int p8 = p1;
3147         while (fSpSet->contains(cAt(p8))) {
3148             p8 = moveBack(p8);
3149         }
3150         while (fCloseSet->contains(cAt(p8))) {
3151             p8 = moveBack(p8);
3152         }
3153         if (fATermSet->contains(cAt(p8))) {
                 // Found the ATerm.  Now scan forwards from p2 to the first character
                 // that ends the negated run (or the end of text, c == -1).
3154             p8=p2;
3155             for (;;) {
3156                 c = cAt(p8);
3157                 if (c==-1 || fOLetterSet->contains(c) || fUpperSet->contains(c) ||
3158                     fLowerSet->contains(c) || fSepSet->contains(c) ||
3159                     fATermSet->contains(c) || fSTermSet->contains(c))  {
3160                     break;
3161                 }
3162                 p8 = moveForward(p8);
3163             }
3164             if (fLowerSet->contains(cAt(p8))) {
3165                 continue;
3166             }
3167         }
3168
3169         // Rule 8a   (STerm | ATerm) Close* Sp* x (SContinue | STerm | ATerm);
3170         if (fSContinueSet->contains(c2) || fSTermSet->contains(c2) || fATermSet->contains(c2)) {
3171             p8 = p1;
3172             while (fSpSet->contains(cAt(p8))) {
3173                 p8 = moveBack(p8);
3174             }
3175             while (fCloseSet->contains(cAt(p8))) {
3176                 p8 = moveBack(p8);
3177             }
3178             c = cAt(p8);
3179             if (fSTermSet->contains(c) || fATermSet->contains(c)) {
3180                 continue;
3181             }
3182         }
3183
3184         // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
3185         int p9 = p1;
3186         while (fCloseSet->contains(cAt(p9))) {
3187             p9 = moveBack(p9);
3188         }
3189         c = cAt(p9);
3190         if ((fSTermSet->contains(c) || fATermSet->contains(c))) {
3191             if (fCloseSet->contains(c2) || fSpSet->contains(c2) || fSepSet->contains(c2)) {
3192                 continue;
3193             }
3194         }
3195
3196         // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
3197         int p10 = p1;
3198         while (fSpSet->contains(cAt(p10))) {
3199             p10 = moveBack(p10);
3200         }
3201         while (fCloseSet->contains(cAt(p10))) {
3202             p10 = moveBack(p10);
3203         }
3204         if (fSTermSet->contains(cAt(p10)) || fATermSet->contains(cAt(p10))) {
3205             if (fSpSet->contains(c2) || fSepSet->contains(c2)) {
3206                 continue;
3207             }
3208         }
3209
3210         // Rule (11)  (STerm | ATerm) Close* Sp* (Sep | CR | LF)?  <break>
             //            The optional separator is stepped over at most once ("if", not
             //            "while") before the Sp* and Close* scans.
3211         int p11 = p1;
3212         if (fSepSet->contains(cAt(p11))) {
3213             p11 = moveBack(p11);
3214         }
3215         while (fSpSet->contains(cAt(p11))) {
3216             p11 = moveBack(p11);
3217         }
3218         while (fCloseSet->contains(cAt(p11))) {
3219             p11 = moveBack(p11);
3220         }
3221         if (fSTermSet->contains(cAt(p11)) || fATermSet->contains(cAt(p11))) {
3222             break;
3223         }
3224
3225         //  Rule (12)  Any x Any
3226         continue;
3227     }
         // The loop is only left via "break"; p2 is the boundary that was found.
3228     breakPos = p2;
3229     return breakPos;
3230 }
3231
3232 RBBISentMonkey::~RBBISentMonkey() {
3233     delete fSets;
3234     delete fSepSet;
3235     delete fFormatSet;
3236     delete fSpSet;
3237     delete fLowerSet;
3238     delete fUpperSet;
3239     delete fOLetterSet;
3240     delete fNumericSet;
3241     delete fATermSet;
3242     delete fSContinueSet;
3243     delete fSTermSet;
3244     delete fCloseSet;
3245     delete fOtherSet;
3246     delete fExtendSet;
3247 }
3248
3249
3250
3251 //-------------------------------------------------------------------------------------------
3252 //
3253 //  RBBILineMonkey
3254 //
3255 //-------------------------------------------------------------------------------------------
3256
3257 class RBBILineMonkey: public RBBIMonkeyKind {
3258 public:
3259     RBBILineMonkey();
3260     virtual          ~RBBILineMonkey();
3261     virtual  UVector *charClasses();
3262     virtual  void     setText(const UnicodeString &s);
3263     virtual  int32_t  next(int32_t i);
3264     virtual  void     rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar);
3265 private:
3266     UVector      *fSets;
3267
3268     UnicodeSet  *fBK;
3269     UnicodeSet  *fCR;
3270     UnicodeSet  *fLF;
3271     UnicodeSet  *fCM;
3272     UnicodeSet  *fNL;
3273     UnicodeSet  *fSG;
3274     UnicodeSet  *fWJ;
3275     UnicodeSet  *fZW;
3276     UnicodeSet  *fGL;
3277     UnicodeSet  *fCB;
3278     UnicodeSet  *fSP;
3279     UnicodeSet  *fB2;
3280     UnicodeSet  *fBA;
3281     UnicodeSet  *fBB;
3282     UnicodeSet  *fHY;
3283     UnicodeSet  *fH2;
3284     UnicodeSet  *fH3;
3285     UnicodeSet  *fCL;
3286     UnicodeSet  *fCP;
3287     UnicodeSet  *fEX;
3288     UnicodeSet  *fIN;
3289     UnicodeSet  *fJL;
3290     UnicodeSet  *fJV;
3291     UnicodeSet  *fJT;
3292     UnicodeSet  *fNS;
3293     UnicodeSet  *fOP;
3294     UnicodeSet  *fQU;
3295     UnicodeSet  *fIS;
3296     UnicodeSet  *fNU;
3297     UnicodeSet  *fPO;
3298     UnicodeSet  *fPR;
3299     UnicodeSet  *fSY;
3300     UnicodeSet  *fAI;
3301     UnicodeSet  *fAL;
3302     UnicodeSet  *fID;
3303     UnicodeSet  *fSA;
3304     UnicodeSet  *fXX;
3305
3306     BreakIterator  *fCharBI;
3307
3308     const UnicodeString  *fText;
3309     int32_t              *fOrigPositions;
3310
3311     RegexMatcher         *fNumberMatcher;
3312     RegexMatcher         *fLB11Matcher;
3313 };
3314
3315
3316 RBBILineMonkey::RBBILineMonkey()
3317 {
3318     UErrorCode  status = U_ZERO_ERROR;
3319
3320     fSets  = new UVector(status);
3321
3322     fBK    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_Break=BK}]"), status);
3323     fCR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CR}]"), status);
3324     fLF    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=LF}]"), status);
3325     fCM    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CM}]"), status);
3326     fNL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NL}]"), status);
3327     fWJ    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=WJ}]"), status);
3328     fZW    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ZW}]"), status);
3329     fGL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=GL}]"), status);
3330     fCB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CB}]"), status);
3331     fSP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SP}]"), status);
3332     fB2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=B2}]"), status);
3333     fBA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BA}]"), status);
3334     fBB    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=BB}]"), status);
3335     fHY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=HY}]"), status);
3336     fH2    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H2}]"), status);
3337     fH3    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=H3}]"), status);
3338     fCL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CL}]"), status);
3339     fCP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=CP}]"), status);
3340     fEX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=EX}]"), status);
3341     fIN    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IN}]"), status);
3342     fJL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JL}]"), status);
3343     fJV    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JV}]"), status);
3344     fJT    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=JT}]"), status);
3345     fNS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NS}]"), status);
3346     fOP    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=OP}]"), status);
3347     fQU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=QU}]"), status);
3348     fIS    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=IS}]"), status);
3349     fNU    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=NU}]"), status);
3350     fPO    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PO}]"), status);
3351     fPR    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=PR}]"), status);
3352     fSY    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SY}]"), status);
3353     fAI    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AI}]"), status);
3354     fAL    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=AL}]"), status);
3355     fID    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=ID}]"), status);
3356     fSA    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=SA}]"), status);
3357     fSG    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\ud800-\\udfff]"), status);
3358     fXX    = new UnicodeSet(UNICODE_STRING_SIMPLE("[\\p{Line_break=XX}]"), status);
3359
3360     if (U_FAILURE(status)) {
3361         deferredStatus = status;
3362         fCharBI = NULL;
3363         fNumberMatcher = NULL;
3364         return;
3365     }
3366
3367     fAL->addAll(*fXX);     // Default behavior for XX is identical to AL
3368     fAL->addAll(*fAI);     // Default behavior for AI is identical to AL
3369     fAL->addAll(*fSA);     // Default behavior for SA is XX, which defaults to AL
3370     fAL->addAll(*fSG);     // Default behavior for SG is identical to AL.
3371
3372     fSets->addElement(fBK, status);
3373     fSets->addElement(fCR, status);
3374     fSets->addElement(fLF, status);
3375     fSets->addElement(fCM, status);
3376     fSets->addElement(fNL, status);
3377     fSets->addElement(fWJ, status);
3378     fSets->addElement(fZW, status);
3379     fSets->addElement(fGL, status);
3380     fSets->addElement(fCB, status);
3381     fSets->addElement(fSP, status);
3382     fSets->addElement(fB2, status);
3383     fSets->addElement(fBA, status);
3384     fSets->addElement(fBB, status);
3385     fSets->addElement(fHY, status);
3386     fSets->addElement(fH2, status);
3387     fSets->addElement(fH3, status);
3388     fSets->addElement(fCL, status);
3389     fSets->addElement(fCP, status);
3390     fSets->addElement(fEX, status);
3391     fSets->addElement(fIN, status);
3392     fSets->addElement(fJL, status);
3393     fSets->addElement(fJT, status);
3394     fSets->addElement(fJV, status);
3395     fSets->addElement(fNS, status);
3396     fSets->addElement(fOP, status);
3397     fSets->addElement(fQU, status);
3398     fSets->addElement(fIS, status);
3399     fSets->addElement(fNU, status);
3400     fSets->addElement(fPO, status);
3401     fSets->addElement(fPR, status);
3402     fSets->addElement(fSY, status);
3403     fSets->addElement(fAI, status);
3404     fSets->addElement(fAL, status);
3405     fSets->addElement(fID, status);
3406     fSets->addElement(fWJ, status);
3407     fSets->addElement(fSA, status);
3408     fSets->addElement(fSG, status);
3409
3410     const char *rules =
3411             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?"
3412             "((\\p{Line_Break=OP}|\\p{Line_Break=HY})\\p{Line_Break=CM}*)?"
3413             "\\p{Line_Break=NU}\\p{Line_Break=CM}*"
3414             "((\\p{Line_Break=NU}|\\p{Line_Break=IS}|\\p{Line_Break=SY})\\p{Line_Break=CM}*)*"
3415             "((\\p{Line_Break=CL}|\\p{Line_Break=CP})\\p{Line_Break=CM}*)?"
3416             "((\\p{Line_Break=PR}|\\p{Line_Break=PO})\\p{Line_Break=CM}*)?";
3417
3418     fNumberMatcher = new RegexMatcher(
3419         UnicodeString(rules, -1, US_INV), 0, status);
3420
3421     fCharBI = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
3422
3423     if (U_FAILURE(status)) {
3424         deferredStatus = status;
3425     }
3426 }
3427
3428
3429 void RBBILineMonkey::setText(const UnicodeString &s) {
3430     fText       = &s;
3431     fCharBI->setText(s);
3432     fNumberMatcher->reset(s);
3433 }
3434
3435 //
3436 //  rule9Adjust
3437 //     Line Break TR rules 9 and 10 implementation.
3438 //     This deals with combining marks and other sequences that
3439 //     that must be treated as if they were something other than what they actually are.
3440 //
3441 //     This is factored out into a separate function because it must be applied twice for
3442 //     each potential break, once to the chars before the position being checked, then
3443 //     again to the text following the possible break.
3444 //
3445 void RBBILineMonkey::rule9Adjust(int32_t pos, UChar32 *posChar, int32_t *nextPos, UChar32 *nextChar) {
3446     if (pos == -1) {
3447         // Invalid initial position.  Happens during the warmup iteration of the
3448         //   main loop in next().
3449         return;
3450     }
3451
3452     int32_t  nPos = *nextPos;
3453
3454     // LB 9  Keep combining sequences together.
3455     //  advance over any CM class chars.  Note that Line Break CM is different
3456     //  from the normal Grapheme Extend property.
3457     if (!(fSP->contains(*posChar) || fBK->contains(*posChar) || *posChar==0x0d ||
3458           *posChar==0x0a ||fNL->contains(*posChar) || fZW->contains(*posChar))) {
3459         for (;;) {
3460             *nextChar = fText->char32At(nPos);
3461             if (!fCM->contains(*nextChar)) {
3462                 break;
3463             }
3464             nPos = fText->moveIndex32(nPos, 1);
3465         }
3466     }
3467
3468
3469     // LB 9 Treat X CM* as if it were x.
3470     //       No explicit action required.
3471
3472     // LB 10  Treat any remaining combining mark as AL
3473     if (fCM->contains(*posChar)) {
3474         *posChar = 0x41;   // thisChar = 'A';
3475     }
3476
3477     // Push the updated nextPos and nextChar back to our caller.
3478     // This only makes a difference if posChar got bigger by consuming a
3479     // combining sequence.
3480     *nextPos  = nPos;
3481     *nextChar = fText->char32At(nPos);
3482 }
3483
3484
3485
3486 int32_t RBBILineMonkey::next(int32_t startPos) {
         // Reference implementation of line break detection, used to check the
         // results of the rule based line break iterator.
         //
         // Scans forward through fText from startPos, one candidate position per
         // loop iteration, applying the rules labelled "LB n" below in order.  A
         // rule that forbids a break does "continue"; a rule that requires one
         // does "break", and the candidate position (pos) is returned.
         //
         // Returns -1 if construction of this object failed (deferredStatus) or if
         // startPos is already at or past the end of the text.  If the number
         // matcher reports an error the loop is left and the current pos is returned.
3487     UErrorCode status = U_ZERO_ERROR;
3488     int32_t    pos;       //  Index of the char following a potential break position
3489     UChar32    thisChar;  //  Character at above position "pos"
3490
3491     int32_t    prevPos;   //  Index of the char preceding a potential break position
3492     UChar32    prevChar;  //  Character at above position.  Note that prevChar
3493                           //   and thisChar may not be adjacent because combining
3494                           //   characters between them will be ignored.
3495
3496     int32_t    nextPos;   //  Index of the next character following pos.
3497                           //     Usually skips over combining marks.
3498     int32_t    nextCPPos; //  Index of the code point following "pos."
3499                           //     May point to a combining mark.
3500     int32_t    tPos;      //  temp value.
3501     UChar32    c;
3502
3503     if (U_FAILURE(deferredStatus)) {
3504         return -1;
3505     }
3506
3507     if (startPos >= fText->length()) {
3508         return -1;
3509     }
3510
3511
3512     // Initial values for loop.  Loop will run the first time without finding breaks,
3513     //                           while the invalid values shift out and the "this" and
3514     //                           "prev" positions are filled in with good values.
3515     pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
3516     thisChar = prevChar  = 0;
3517     nextPos  = nextCPPos = startPos;
3518
3519
3520     // Loop runs once per position in the test text, until a break position
3521     //  is found.
3522     for (;;) {
3523         prevPos   = pos;
3524         prevChar  = thisChar;
3525
3526         pos       = nextPos;
3527         thisChar  = fText->char32At(pos);
3528
3529         nextCPPos = fText->moveIndex32(pos, 1);
3530         nextPos   = nextCPPos;
3531
3532         // Rule LB2 - Break at end of text.
3533         if (pos >= fText->length()) {
3534             break;
3535         }
3536
3537         // Rule LB 9 - adjust for combining sequences.
3538         //             We do this one out-of-order because the adjustment does not change anything
3539         //             that would match rules LB 3 - LB 6, but after the adjustment, LB 3-6 do need to
3540         //             be applied.
             //             The first call may move pos forward (it is passed as the "next"
             //             position of prevChar), so nextPos is recomputed from the new pos
             //             before the second call.
3541         rule9Adjust(prevPos, &prevChar, &pos,     &thisChar);
3542         nextCPPos = nextPos = fText->moveIndex32(pos, 1);
3543         c = fText->char32At(nextPos);
3544         rule9Adjust(pos,     &thisChar, &nextPos, &c);
3545
3546         // If the loop is still warming up - if we haven't shifted the initial
3547         //   -1 positions out of prevPos yet - loop back to advance the
3548         //    position in the input without any further looking for breaks.
3549         if (prevPos == -1) {
3550             continue;
3551         }
3552
3553         // LB 4  Always break after hard line breaks,
3554         if (fBK->contains(prevChar)) {
3555             break;
3556         }
3557
3558         // LB 5  Break after CR, LF, NL, but not inside CR LF
3559         if (prevChar == 0x0d && thisChar == 0x0a) {
3560             continue;
3561         }
3562         if (prevChar == 0x0d ||
3563             prevChar == 0x0a ||
3564             prevChar == 0x85)  {
3565             break;
3566         }
3567
3568         // LB 6  Don't break before hard line breaks
3569         if (thisChar == 0x0d || thisChar == 0x0a || thisChar == 0x85 ||
3570             fBK->contains(thisChar)) {
3571                 continue;
3572         }
3573
3574
3575         // LB 7  Don't break before spaces or zero-width space.
3576         if (fSP->contains(thisChar)) {
3577             continue;
3578         }
3579
3580         if (fZW->contains(thisChar)) {
3581             continue;
3582         }
3583
3584         // LB 8  Break after zero width space
3585         if (fZW->contains(prevChar)) {
3586             break;
3587         }
3588
3589         // LB 9, 10  Already done, at top of loop.
3590         //
3591
3592
3593         // LB 11  Do not break before or after WORD JOINER and related characters.
3594         //    x  WJ
3595         //    WJ  x
3596         //
3597         if (fWJ->contains(thisChar) || fWJ->contains(prevChar)) {
3598             continue;
3599         }
3600
3601         // LB 12
3602         //    GL  x
3603         if (fGL->contains(prevChar)) {
3604             continue;
3605         }
3606
3607         // LB 12a
3608         //    [^SP BA HY] x GL
3609         if (!(fSP->contains(prevChar) ||
3610               fBA->contains(prevChar) ||
3611               fHY->contains(prevChar)     ) && fGL->contains(thisChar)) {
3612             continue;
3613         }
3614
3615
3616
3617         // LB 13  Don't break before closings.
3618         //        NU x CL,  NU x CP  and NU x IS are not matched here so that they will
3619         //        fall into LB 17 and the more general number regular expression.
             //        NOTE(review): the number regular expression is applied under the
             //        label "LB 25" further down; the "LB 17" above looks like a stale
             //        rule number.
3620         //
3621         if ((!fNU->contains(prevChar) && fCL->contains(thisChar)) ||
3622             (!fNU->contains(prevChar) && fCP->contains(thisChar)) ||
3623                                          fEX->contains(thisChar)  ||
3624             (!fNU->contains(prevChar) && fIS->contains(thisChar)) ||
3625             (!fNU->contains(prevChar) && fSY->contains(thisChar)))    {
3626             continue;
3627         }
3628
3629         // LB 14 Don't break after OP SP*
3630         //       Scan backwards, checking for this sequence.
3631         //       The OP char could include combining marks, so we actually check for
3632         //           OP CM* SP*
3633         //       Another Twist: The Rule 67 fixes may have changed a SP CM
3634         //       sequence into a ID char, so before scanning back through spaces,
3635         //       verify that prevChar is indeed a space.  The prevChar variable
3636         //       may differ from fText[prevPos]
3637         tPos = prevPos;
3638         if (fSP->contains(prevChar)) {
3639             while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3640                 tPos=fText->moveIndex32(tPos, -1);
3641             }
3642         }
3643         while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3644             tPos=fText->moveIndex32(tPos, -1);
3645         }
3646         if (fOP->contains(fText->char32At(tPos))) {
3647             continue;
3648         }
3649
3650
3651         // LB 15    QU SP* x OP
3652         if (fOP->contains(thisChar)) {
3653             // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
                 // Note: this local tPos shadows the function-level tPos.  The outer
                 //       variable is assigned again before each of its later uses.
3654             int tPos = prevPos;
3655             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3656                 tPos = fText->moveIndex32(tPos, -1);
3657             }
3658             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3659                 tPos = fText->moveIndex32(tPos, -1);
3660             }
3661             if (fQU->contains(fText->char32At(tPos))) {
3662                 continue;
3663             }
3664         }
3665
3666
3667
3668         // LB 16   (CL | CP) SP* x NS
3669         //    Scan backwards for SP* CM* (CL | CP)
3670         if (fNS->contains(thisChar)) {
                 // Note: this local tPos also shadows the function-level tPos.
3671             int tPos = prevPos;
3672             while (tPos>0 && fSP->contains(fText->char32At(tPos))) {
3673                 tPos = fText->moveIndex32(tPos, -1);
3674             }
3675             while (tPos>0 && fCM->contains(fText->char32At(tPos))) {
3676                 tPos = fText->moveIndex32(tPos, -1);
3677             }
3678             if (fCL->contains(fText->char32At(tPos)) || fCP->contains(fText->char32At(tPos))) {
3679                 continue;
3680             }
3681         }
3682
3683
3684         // LB 17        B2 SP* x B2
3685         if (fB2->contains(thisChar)) {
3686             //  Scan backwards, checking for the B2 CM* SP* sequence.
3687             tPos = prevPos;
3688             if (fSP->contains(prevChar)) {
3689                 while (tPos > 0 && fSP->contains(fText->char32At(tPos))) {
3690                     tPos=fText->moveIndex32(tPos, -1);
3691                 }
3692             }
3693             while (tPos > 0 && fCM->contains(fText->char32At(tPos))) {
3694                 tPos=fText->moveIndex32(tPos, -1);
3695             }
3696             if (fB2->contains(fText->char32At(tPos))) {
3697                 continue;
3698             }
3699         }
3700
3701
3702         // LB 18    break after space
3703         if (fSP->contains(prevChar)) {
3704             break;
3705         }
3706
3707         // LB 19
3708         //    x   QU
3709         //    QU  x
3710         if (fQU->contains(thisChar) || fQU->contains(prevChar)) {
3711             continue;
3712         }
3713
3714         // LB 20  Break around a CB
3715         if (fCB->contains(thisChar) || fCB->contains(prevChar)) {
3716             break;
3717         }
3718
3719         // LB 21
             //    x BA,  x HY,  x NS,  BB x
3720         if (fBA->contains(thisChar) ||
3721             fHY->contains(thisChar) ||
3722             fNS->contains(thisChar) ||
3723             fBB->contains(prevChar) )   {
3724             continue;
3725         }
3726
3727         // LB 22
             //    (AL | ID | IN | NU) x IN
3728         if ((fAL->contains(prevChar) && fIN->contains(thisChar)) ||
3729             (fID->contains(prevChar) && fIN->contains(thisChar)) ||
3730             (fIN->contains(prevChar) && fIN->contains(thisChar)) ||
3731             (fNU->contains(prevChar) && fIN->contains(thisChar)) )   {
3732             continue;
3733         }
3734
3735
3736         // LB 23    ID x PO
3737         //          AL x NU
3738         //          NU x AL
3739         if ((fID->contains(prevChar) && fPO->contains(thisChar)) ||
3740             (fAL->contains(prevChar) && fNU->contains(thisChar)) ||
3741             (fNU->contains(prevChar) && fAL->contains(thisChar)) )   {
3742             continue;
3743         }
3744
3745         // LB 24  Do not break between prefix and letters or ideographs.
3746         //        PR x ID
3747         //        PR x AL
3748         //        PO x AL
3749         if ((fPR->contains(prevChar) && fID->contains(thisChar)) ||
3750             (fPR->contains(prevChar) && fAL->contains(thisChar)) ||
3751             (fPO->contains(prevChar) && fAL->contains(thisChar)) )   {
3752             continue;
3753         }
3754
3755
3756
3757         // LB 25    Numbers
             //          Try the number regular expression (built in the constructor),
             //          anchored at prevPos.
3758         if (fNumberMatcher->lookingAt(prevPos, status)) {
3759             if (U_FAILURE(status)) {
3760                 break;
3761             }
3762             // Matched a number.  But could have been just a single digit, which would
3763             //    not represent a "no break here" between prevChar and thisChar
3764             int32_t numEndIdx = fNumberMatcher->end(status);  // idx of first char following num
3765             if (numEndIdx > pos) {
3766                 // Number match includes at least our two chars being checked
3767                 if (numEndIdx > nextPos) {
3768                     // Number match includes additional chars.  Update pos and nextPos
3769                     //   so that next loop iteration will continue at the end of the number,
3770                     //   checking for breaks between last char in number & whatever follows.
3771                     pos = nextPos = numEndIdx;
                         // Back pos up to the last character of the number that is not a
                         // combining mark; it becomes prevChar on the next iteration.
3772                     do {
3773                         pos = fText->moveIndex32(pos, -1);
3774                         thisChar = fText->char32At(pos);
3775                     } while (fCM->contains(thisChar));
3776                 }
3777                 continue;
3778             }
3779         }
3780
3781
3782         // LB 26 Do not break a Korean syllable.
             //          JL x (JL | JV | H2 | H3)
             //          (JV | H2) x (JV | JT)
             //          (JT | H3) x JT
3783         if (fJL->contains(prevChar) && (fJL->contains(thisChar) ||
3784                                         fJV->contains(thisChar) ||
3785                                         fH2->contains(thisChar) ||
3786                                         fH3->contains(thisChar))) {
3787                                             continue;
3788                                         }
3789
3790         if ((fJV->contains(prevChar) || fH2->contains(prevChar))  &&
3791             (fJV->contains(thisChar) || fJT->contains(thisChar))) {
3792                 continue;
3793         }
3794
3795         if ((fJT->contains(prevChar) || fH3->contains(prevChar)) &&
3796             fJT->contains(thisChar)) {
3797                 continue;
3798         }
3799
3800         // LB 27 Treat a Korean Syllable Block the same as ID.
             //          (JL | JV | JT | H2 | H3) x IN
             //          (JL | JV | JT | H2 | H3) x PO
             //          PR x (JL | JV | JT | H2 | H3)
3801         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3802             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3803             fIN->contains(thisChar)) {
3804                 continue;
3805             }
3806         if ((fJL->contains(prevChar) || fJV->contains(prevChar) ||
3807             fJT->contains(prevChar) || fH2->contains(prevChar) || fH3->contains(prevChar)) &&
3808             fPO->contains(thisChar)) {
3809                 continue;
3810             }
3811         if (fPR->contains(prevChar) && (fJL->contains(thisChar) || fJV->contains(thisChar) ||
3812             fJT->contains(thisChar) || fH2->contains(thisChar) || fH3->contains(thisChar))) {
3813                 continue;
3814             }
3815
3816
3817
3818         // LB 28  Do not break between alphabetics ("at").
3819         if (fAL->contains(prevChar) && fAL->contains(thisChar)) {
3820             continue;
3821         }
3822
3823         // LB 29  Do not break between numeric punctuation and alphabetics ("e.g.").
3824         if (fIS->contains(prevChar) && fAL->contains(thisChar)) {
3825             continue;
3826         }
3827
3828         // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
3829         //          (AL | NU) x OP
3830         //          CP x (AL | NU)
3831         if ((fAL->contains(prevChar) || fNU->contains(prevChar)) && fOP->contains(thisChar)) {
3832             continue;
3833         }
3834         if (fCP->contains(prevChar) && (fAL->contains(thisChar) || fNU->contains(thisChar))) {
3835             continue;
3836         }
3837
3838         // LB 31    Break everywhere else
3839         break;
3840
3841     }
3842
3843     return pos;
3844 }
3845
3846
3847 UVector  *RBBILineMonkey::charClasses() {
3848     return fSets;
3849 }
3850
3851
3852 RBBILineMonkey::~RBBILineMonkey() {
3853     delete fSets;
3854
3855     delete fBK;
3856     delete fCR;
3857     delete fLF;
3858     delete fCM;
3859     delete fNL;
3860     delete fWJ;
3861     delete fZW;
3862     delete fGL;
3863     delete fCB;
3864     delete fSP;
3865     delete fB2;
3866     delete fBA;
3867     delete fBB;
3868     delete fHY;
3869     delete fH2;
3870     delete fH3;
3871     delete fCL;
3872     delete fCP;
3873     delete fEX;
3874     delete fIN;
3875     delete fJL;
3876     delete fJV;
3877     delete fJT;
3878     delete fNS;
3879     delete fOP;
3880     delete fQU;
3881     delete fIS;
3882     delete fNU;
3883     delete fPO;
3884     delete fPR;
3885     delete fSY;
3886     delete fAI;
3887     delete fAL;
3888     delete fID;
3889     delete fSA;
3890     delete fSG;
3891     delete fXX;
3892
3893     delete fCharBI;
3894     delete fNumberMatcher;
3895 }
3896
3897
//-------------------------------------------------------------------------------------------
//
//   TestMonkey
//
//     params
//       seed=nnnnn        Random number starting seed.
//                         Setting the seed allows errors to be reproduced.
//       loop=nnn          Looping count.  Controls running time.
//                         -1:  run forever.
//                          0 or greater:  run length.
//
//       type = char | word | line | sent | title
//                         Kind of break iterator to test.  When omitted, the
//                         char, word, line and sent tests are all run.
//                         ("title" is accepted by the option parser, but
//                         TestMonkey has no title test to run for it.)
//
//       utext             Supply the test text to the break iterator through
//                         a UText rather than directly as a UnicodeString.
//
//-------------------------------------------------------------------------------------------
3912
3913 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
3914     int32_t val = defaultVal;
3915     name.append(" *= *(-?\\d+)");
3916     UErrorCode status = U_ZERO_ERROR;
3917     RegexMatcher m(name, params, 0, status);
3918     if (m.find()) {
3919         // The param exists.  Convert the string to an int.
3920         char valString[100];
3921         int32_t paramLength = m.end(1, status) - m.start(1, status);
3922         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
3923             paramLength = (int32_t)(sizeof(valString)-2);
3924         }
3925         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
3926         val = strtol(valString,  NULL, 10);
3927
3928         // Delete this parameter from the params string.
3929         m.reset();
3930         params = m.replaceFirst("", status);
3931     }
3932     U_ASSERT(U_SUCCESS(status));
3933     return val;
3934 }
3935 #endif
3936
3937 static void testBreakBoundPreceding(RBBITest *test, UnicodeString ustr,
3938                                     BreakIterator *bi,
3939                                     int expected[],
3940                                     int expectedcount)
3941 {
3942     int count = 0;
3943     int i = 0;
3944     int forward[50];
3945     bi->setText(ustr);
3946     for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
3947         forward[count] = i;
3948         if (count < expectedcount && expected[count] != i) {
3949             test->errln("break forward test failed: expected %d but got %d",
3950                         expected[count], i);
3951             break;
3952         }
3953         count ++;
3954     }
3955     if (count != expectedcount) {
3956         printStringBreaks(ustr, expected, expectedcount);
3957         test->errln("break forward test failed: missed %d match",
3958                     expectedcount - count);
3959         return;
3960     }
3961     // testing boundaries
3962     for (i = 1; i < expectedcount; i ++) {
3963         int j = expected[i - 1];
3964         if (!bi->isBoundary(j)) {
3965             printStringBreaks(ustr, expected, expectedcount);
3966             test->errln("isBoundary() failed.  Expected boundary at position %d", j);
3967             return;
3968         }
3969         for (j = expected[i - 1] + 1; j < expected[i]; j ++) {
3970             if (bi->isBoundary(j)) {
3971                 printStringBreaks(ustr, expected, expectedcount);
3972                 test->errln("isBoundary() failed.  Not expecting boundary at position %d", j);
3973                 return;
3974             }
3975         }
3976     }
3977
3978     for (i = bi->last(); i != BreakIterator::DONE; i = bi->previous()) {
3979         count --;
3980         if (forward[count] != i) {
3981             test->errln("happy break test previous() failed: expected %d but got %d",
3982                         forward[count], i);
3983             break;
3984         }
3985     }
3986     if (count != 0) {
3987         printStringBreaks(ustr, expected, expectedcount);
3988         test->errln("break test previous() failed: missed a match");
3989         return;
3990     }
3991
3992     // testing preceding
3993     for (i = 0; i < expectedcount - 1; i ++) {
3994         // int j = expected[i] + 1;
3995         int j = ustr.moveIndex32(expected[i], 1);
3996         for (; j <= expected[i + 1]; j ++) {
3997             if (bi->preceding(j) != expected[i]) {
3998                 printStringBreaks(ustr, expected, expectedcount);
3999                 test->errln("preceding(): Not expecting boundary at position %d", j);
4000                 return;
4001             }
4002         }
4003     }
4004 }
4005
4006 void RBBITest::TestWordBreaks(void)
4007 {
4008 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4009
4010     Locale        locale("en");
4011     UErrorCode    status = U_ZERO_ERROR;
4012     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4013     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4014     static const char *strlist[] =
4015     {
4016     "\\U000e0032\\u0097\\u0f94\\uc2d8\\u05f4\\U000e0031\\u060d",
4017     "\\U000e0037\\u4666\\u1202\\u003a\\U000e0031\\u064d\\u0bea\\u591c\\U000e0040\\u003b",
4018     "\\u0589\\u3e99\\U0001d7f3\\U000e0074\\u1810\\u200e\\U000e004b\\u0027\\U000e0061\\u003a",
4019     "\\u398c\\U000104a5\\U0001d173\\u102d\\u002e\\uca3b\\u002e\\u002c\\u5622",
4020     "\\u90ca\\u3588\\u009c\\u0953\\u194b",
4021     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4022     "\\u0602\\u2019\\ua191\\U000e0063\\u0a4c\\u003a\\ub4b5\\u003a\\u827f\\u002e",
4023     "\\u7f1f\\uc634\\u65f8\\u0944\\u04f2\\uacdf\\u1f9c\\u05f4\\u002e",
4024     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4025     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4026     "\\u2027\\U000e0067\\u0a47\\u00b7",
4027     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4028     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4029     "\\u0589\\U000e006e\\u0a42\\U000104a5",
4030     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4031     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4032     "\\u0027\\u11af\\U000e0057\\u0602",
4033     "\\U0001d7f2\\U000e007\\u0004\\u0589",
4034     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4035     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4036     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4037     "\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4038     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4039     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4040     "\\u0233\\U000e0020\\u0a69\\u0d6a",
4041     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4042     "\\u58f4\\U000e0049\\u20e7\\u2027",
4043     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4044     "\\ua183\\u102d\\u0bec\\u003a",
4045     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4046     "\\u003a\\u0e57\\u0fad\\u002e",
4047     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4048     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4049     "\\U000e005d\\u2044\\u0731\\u0650\\u0061",
4050     "\\u003a\\u0664\\u00b7\\u1fba",
4051     "\\u003b\\u0027\\u00b7\\u47a3",
4052     "\\u2027\\U000e0067\\u0a42\\u00b7\\ubddf\\uc26c\\u003a\\u4186\\u041b",
4053     "\\u0027\\u003a\\U0001d70f\\U0001d7df\\ubf4a\\U0001d7f5\\U0001d177\\u003a\\u0e51\\u1058\\U000e0058\\u00b7\\u0673",
4054     "\\uc30d\\u002e\\U000e002c\\u0c48\\u003a\\ub5a1\\u0661\\u002c",
4055     };
4056     int loop;
4057     if (U_FAILURE(status)) {
4058         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4059         return;
4060     }
4061     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4062         // printf("looping %d\n", loop);
4063         UnicodeString ustr = CharsToUnicodeString(strlist[loop]);
4064         // RBBICharMonkey monkey;
4065         RBBIWordMonkey monkey;
4066
4067         int expected[50];
4068         int expectedcount = 0;
4069
4070         monkey.setText(ustr);
4071         int i;
4072         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4073             expected[expectedcount ++] = i;
4074         }
4075
4076         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4077     }
4078     delete bi;
4079 #endif
4080 }
4081
4082 void RBBITest::TestWordBoundary(void)
4083 {
4084     // <data><>\u1d4a\u206e<?>\u0603\U0001d7ff<>\u2019<></data>
4085     Locale        locale("en");
4086     UErrorCode    status = U_ZERO_ERROR;
4087     // BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4088     BreakIterator *bi = BreakIterator::createWordInstance(locale, status);
4089     UChar         str[50];
4090     static const char *strlist[] =
4091     {
4092     "\\u200e\\U000e0072\\u0a4b\\U000e003f\\ufd2b\\u2027\\u002e\\u002e",
4093     "\\U000e0042\\u002e\\u0fb8\\u09ef\\u0ed1\\u2044",
4094     "\\u003b\\u024a\\u102e\\U000e0071\\u0600",
4095     "\\u2027\\U000e0067\\u0a47\\u00b7",
4096     "\\u1fcd\\u002c\\u07aa\\u0027\\u11b0",
4097     "\\u002c\\U000e003c\\U0001d7f4\\u003a\\u0c6f\\u0027",
4098     "\\u0589\\U000e006e\\u0a42\\U000104a5",
4099     "\\u4f66\\ub523\\u003a\\uacae\\U000e0047\\u003a",
4100     "\\u003a\\u0f21\\u0668\\u0dab\\u003a\\u0655\\u00b7",
4101     "\\u0027\\u11af\\U000e0057\\u0602",
4102     "\\U0001d7f2\\U000e007\\u0004\\u0589",
4103     "\\U000e0022\\u003a\\u10b3\\u003a\\ua21b\\u002e\\U000e0058\\u1732\\U000e002b",
4104     "\\U0001d7f2\\U000e007d\\u0004\\u0589",
4105     "\\u82ab\\u17e8\\u0736\\u2019\\U0001d64d",
4106     "\\u0e01\\ub55c\\u0a68\\U000e0037\\u0cd6\\u002c\\ub959",
4107     "\\U000e0065\\u302c\\uc986\\u09ee\\U000e0068",
4108     "\\u0be8\\u002e\\u0c68\\u066e\\u136d\\ufc99\\u59e7",
4109     "\\u0233\\U000e0020\\u0a69\\u0d6a",
4110     "\\u206f\\u0741\\ub3ab\\u2019\\ubcac\\u2019",
4111     "\\u58f4\\U000e0049\\u20e7\\u2027",
4112     "\\ub315\\U0001d7e5\\U000e0073\\u0c47\\u06f2\\u0c6a\\u0037\\u10fe",
4113     "\\ua183\\u102d\\u0bec\\u003a",
4114     "\\u17e8\\u06e7\\u002e\\u096d\\u003b",
4115     "\\u003a\\u0e57\\u0fad\\u002e",
4116     "\\u002e\\U000e004c\\U0001d7ea\\u05bb\\ud0fd\\u02de",
4117     "\\u32e6\\U0001d7f6\\u0fa1\\u206a\\U000e003c\\u0cec\\u003a",
4118     "\\ua2a5\\u0038\\u2044\\u002e\\u0c67\\U000e003c\\u05f4\\u2027\\u05f4\\u2019",
4119     "\\u003a\\u0664\\u00b7\\u1fba",
4120     "\\u003b\\u0027\\u00b7\\u47a3",
4121     };
4122     int loop;
4123     if (U_FAILURE(status)) {
4124         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4125         return;
4126     }
4127     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4128         // printf("looping %d\n", loop);
4129         u_unescape(strlist[loop], str, 20);
4130         UnicodeString ustr(str);
4131         int forward[50];
4132         int count = 0;
4133
4134         bi->setText(ustr);
4135         int prev = 0;
4136         int i;
4137         for (i = bi->first(); i != BreakIterator::DONE; i = bi->next()) {
4138             forward[count ++] = i;
4139             if (i > prev) {
4140                 int j;
4141                 for (j = prev + 1; j < i; j ++) {
4142                     if (bi->isBoundary(j)) {
4143                         printStringBreaks(ustr, forward, count);
4144                         errln("happy boundary test failed: expected %d not a boundary",
4145                                j);
4146                         return;
4147                     }
4148                 }
4149             }
4150             if (!bi->isBoundary(i)) {
4151                 printStringBreaks(ustr, forward, count);
4152                 errln("happy boundary test failed: expected %d a boundary",
4153                        i);
4154                 return;
4155             }
4156             prev = i;
4157         }
4158     }
4159     delete bi;
4160 }
4161
4162 void RBBITest::TestLineBreaks(void)
4163 {
4164 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4165     Locale        locale("en");
4166     UErrorCode    status = U_ZERO_ERROR;
4167     BreakIterator *bi = BreakIterator::createLineInstance(locale, status);
4168     const int32_t  STRSIZE = 50;
4169     UChar         str[STRSIZE];
4170     static const char *strlist[] =
4171     {
4172      "\\u300f\\ufdfc\\ub798\\u2011\\u2011\\u0020\\u0b43\\u002d\\ubeec\\ufffc",
4173      "\\u24ba\\u2060\\u3405\\ub290\\u000d\\U000e0032\\ufe35\\u00a0\\u0361\\"
4174              "U000112ed\\u0f0c\\u000a\\u308e\\ua875\\u0085\\u114d",
4175      "\\ufffc\\u3063\\u2e08\\u30e3\\u000d\\u002d\\u0ed8\\u002f\\U00011a57\\"
4176              "u2014\\U000e0105\\u118c\\u000a\\u07f8",
4177      "\\u0668\\u192b\\u002f\\u2034\\ufe39\\u00b4\\u0cc8\\u2571\\u200b\\u003f",
4178      "\\ufeff\\ufffc\\u3289\\u0085\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4179      "\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0\\u002d\\u30a7\\u17a4",
4180      "\\u2772\\u0020\\U000e010a\\u0020\\u2025\\u000a\\U000e0123",
4181      "\\u002d\\uff1b\\u02c8\\u2029\\ufeff\\u0f22\\u2044\\ufe09\\u003a\\u096d\\u2009\\u000a\\u06f7\\u02cc\\u1019\\u2060",
4182      "\\u1781\\u0b68\\u0f0c\\u3010\\u0085\\U00011f7a\\u0020\\u0dd6\\u200b\\U000e007a\\u000a\\u2060\\u2026\\u002f\\u2026\\u24dc\\u101e\\u2014\\u2007\\u30a5",
4183      "\\u2770\\u0020\\U000e010f\\u0020\\u2060\\u000a\\u02cc\\u0bcc\\u060d\\u30e7\\u0f3b\\u002f",
4184      "\\ufeff\\u0028\\u003b\\U00012fec\\u2010\\u0020\\u0004\\u200b\\u0020\\u275c\\u002f\\u17b1",
4185      "\\u20a9\\u2014\\u00a2\\u31f1\\u002f\\u0020\\u05b8\\u200b\\u0cc2\\u003b\\u060d\\u02c8\\ua4e8\\u002f\\u17d5",
4186      "\\u002d\\u136f\\uff63\\u0084\\ua933\\u2028\\u002d\\u431b\\u200b\\u20b0",
4187      "\\uade3\\u11d6\\u000a\\U0001107d\\u203a\\u201d\\ub070\\u000d\\u2024\\ufffc",
4188      "\\uff5b\\u101c\\u1806\\u002f\\u2213\\uff5f",
4189      "\\u2014\\u0a83\\ufdfc\\u003f\\u00a0\\u0020\\u000a\\u2991\\U0001d179\\u0020\\u201d\\U000125f6\\u0a67\\u20a7\\ufeff\\u043f",
4190      "\\u169b\\U000e0130\\u002d\\u1041\\u0f3d\\u0abf\\u00b0\\u31fb\\u00a0\\u002d\\u02c8\\u003b",
4191      "\\u2762\\u1680\\u002d\\u2028\\u0027\\u01dc\\ufe56\\u003a\\u000a\\uffe6\\u29fd\\u0020\\u30ee\\u007c\\U0001d178\\u0af1\\u0085",
4192      "\\u3010\\u200b\\u2029\\ufeff\\ufe6a\\u275b\\U000e013b\\ufe37\\u24d4\\u002d\\u1806\\u256a\\u1806\\u247c\\u0085\\u17ac",
4193      "\\u99ab\\u0027\\u003b\\u2026\\ueaf0\\u0020\\u0020\\u0313\\u0020\\u3099\\uff09\\u208e\\u2011\\u2007\\u2060\\u000a\\u0020\\u0020\\u300b\\u0bf9",
4194      "\\u1806\\u060d\\u30f5\\u00b4\\u17e9\\u2544\\u2028\\u2024\\u2011\\u20a3\\u002d\\u09cc\\u1782\\u000d\\uff6f\\u0025",
4195      "\\u002f\\uf22e\\u1944\\ufe3d\\u0020\\u206f\\u31b3\\u2014\\u002d\\u2025\\u0f0c\\u0085\\u2763",
4196      "\\u002f\\u2563\\u202f\\u0085\\u17d5\\u200b\\u0020\\U000e0043\\u2014\\u058a\\u3d0a\\ufe57\\u2035\\u2028\\u2029",
4197      "\\u20ae\\U0001d169\\u9293\\uff1f\\uff1f\\u0021\\u2012\\u2039\\u0085\\u02cc\\u00a2\\u0020\\U000e01ab\\u3085\\u0f3a\\u1806\\u0f0c\\u1945\\u000a\\U0001d7e7",
4198      "\\uffe6\\u00a0\\u200b\\u0085\\u2116\\u255b\\U0001d7f7\\u178c\\ufffc",
4199      "\\u02cc\\ufe6a\\u00a0\\u0021\\u002d\\u7490\\uec2e\\u200b\\u000a",
4200      "\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014\\u8945",
4201      "\\u7490\\uec2e\\u200b\\u000a\\u0020\\u2028\\u2014",
4202      "\\u0020\\u2028\\u2014\\u8945\\u002c\\u005b",
4203      "\\u000a\\ufe3c\\u201c\\u000d\\u2025\\u2007\\u201c\\u002d\\u20a0",
4204      "\\u2473\\u0e9d\\u0020\\u0085\\u000a\\ufe3c\\u201c\\u000d\\u2025",
4205      "\\U0001d16e\\ufffc\\u2025\\u0021\\u002d",
4206      "\\ufffc\\u301b\\u0fa5\\U000e0103\\u2060\\u208e\\u17d5\\u034f\\u1009\\u003a\\u180e\\u2009\\u3111",
4207      "\\u2014\\u0020\\u000a\\u17c5\\u24fc",
4208      "\\ufffc\\u0020\\u2116\\uff6c\\u200b\\u0ac3\\U0001028f",
4209      "\\uaeb0\\u0344\\u0085\\ufffc\\u073b\\u2010",
4210      "\\ufeff\\u0589\\u0085\\u0eb8\\u30fd\\u002f\\u003a\\u2014\\ufe43",
4211      "\\u09cc\\u256a\\u276d\\u002d\\u3085\\u000d\\u0e05\\u2028\\u0fbb",
4212      "\\u2034\\u00bb\\u0ae6\\u300c\\u0020\\u31f8\\ufffc",
4213      "\\u2116\\u0ed2\\uff64\\u02cd\\u2001\\u2060",
4214          "\\u809d\\u2e02\\u0f0a\\uc48f\\u2540\\u000d\\u0cef\\u003a\\u0e4d"
4215          "\\U000e0172\\U000e005c\\u17cf\\U00010ca6\\ufeff\\uf621\\u06f3\\uffe5"
4216          "\\u0ea2\\ufeff\\udcea\\u3085\\ua874\\u000a\\u0020\\u000b\\u200b",
4217      "\\ufe10\\u2060\\u1a5a\\u2060\\u17e4\\ufffc\\ubbe1\\ufe15\\u0020\\u00a0",
4218          "\\u2060\\u2213\\u200b\\u2019\\uc2dc\\uff6a\\u1736\\u0085\\udb07",
4219     };
4220     int loop;
4221     TEST_ASSERT_SUCCESS(status);
4222     if (U_FAILURE(status)) {
4223         return;
4224     }
4225     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4226         // printf("looping %d\n", loop);
4227         int32_t t = u_unescape(strlist[loop], str, STRSIZE);
4228         if (t >= STRSIZE) {
4229             TEST_ASSERT(FALSE);
4230             continue;
4231         }
4232
4233
4234         UnicodeString ustr(str);
4235         RBBILineMonkey monkey;
4236         if (U_FAILURE(monkey.deferredStatus)) {
4237             continue;
4238         }
4239
4240         const int EXPECTEDSIZE = 50;
4241         int expected[EXPECTEDSIZE];
4242         int expectedcount = 0;
4243
4244         monkey.setText(ustr);
4245         int i;
4246         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4247             if (expectedcount >= EXPECTEDSIZE) {
4248                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4249                 return;
4250             }
4251             expected[expectedcount ++] = i;
4252         }
4253
4254         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4255     }
4256     delete bi;
4257 #endif
4258 }
4259
4260 void RBBITest::TestSentBreaks(void)
4261 {
4262 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4263     Locale        locale("en");
4264     UErrorCode    status = U_ZERO_ERROR;
4265     BreakIterator *bi = BreakIterator::createSentenceInstance(locale, status);
4266     UChar         str[200];
4267     static const char *strlist[] =
4268     {
4269      "Now\ris\nthe\r\ntime\n\rfor\r\r",
4270      "This\n",
4271      "Hello! how are you? I'am fine. Thankyou. How are you doing? This\n costs $20,00,000.",
4272      "\"Sentence ending with a quote.\" Bye.",
4273      "  (This is it).  Testing the sentence iterator. \"This isn't it.\"",
4274      "Hi! This is a simple sample sentence. (This is it.) This is a simple sample sentence. \"This isn't it.\"",
4275      "Hi! This is a simple sample sentence. It does not have to make any sense as you can see. ",
4276      "Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. ",
4277      "Che la dritta via aveo smarrita. He said, that I said, that you said!! ",
4278      "Don't rock the boat.\\u2029Because I am the daddy, that is why. Not on my time (el timo.)!",
4279      "\\U0001040a\\u203a\\u1217\\u2b23\\u000d\\uff3b\\u03dd\\uff57\\u0a69\\u104a\\ufe56\\ufe52"
4280              "\\u3016\\U000e002f\\U000e0077\\u0662\\u1680\\u2984\\U000e006a\\u002e\\ua6ab\\u104a"
4281              "\\u002e\\u019b\\u2005\\u002e\\u0477\\u0438\\u0085\\u0441\\u002e\\u5f61\\u202f"
4282              "\\U0001019f\\uff08\\u27e8\\u055c\\u0352",
4283      "\\u1f3e\\u004d\\u000a\\ua3e4\\U000e0023\\uff63\\u0c52\\u276d\\U0001d5de\\U0001d171"
4284              "\\u0e38\\u17e5\\U00012fe6\\u0fa9\\u267f\\u1da3\\u0046\\u03ed\\udc72\\u0030"
4285              "\\U0001d688\\u0b6d\\u0085\\u0c67\\u1f94\\u0c6c\\u9cb2\\u202a\\u180e\\u000b"
4286              "\\u002e\\U000e005e\\u035b\\u061f\\u02c1\\U000e0025\\u0357\\u0969\\u202b"
4287              "\\U000130c5\\u0486\\U000e0123\\u2019\\u01bc\\u2006\\u11ad\\u180e\\u2e05"
4288              "\\u10b7\\u013e\\u000a\\u002e\\U00013ea4"
4289     };
4290     int loop;
4291     if (U_FAILURE(status)) {
4292         errcheckln(status, "Creation of break iterator failed %s", u_errorName(status));
4293         return;
4294     }
4295     for (loop = 0; loop < (int)(sizeof(strlist) / sizeof(char *)); loop ++) {
4296         u_unescape(strlist[loop], str, (int32_t)(sizeof(str) / sizeof(str[0])));
4297         UnicodeString ustr(str);
4298
4299         RBBISentMonkey monkey;
4300         if (U_FAILURE(monkey.deferredStatus)) {
4301             continue;
4302         }
4303
4304         const int EXPECTEDSIZE = 50;
4305         int expected[EXPECTEDSIZE];
4306         int expectedcount = 0;
4307
4308         monkey.setText(ustr);
4309         int i;
4310         for (i = 0; i != BreakIterator::DONE; i = monkey.next(i)) {
4311             if (expectedcount >= EXPECTEDSIZE) {
4312                 TEST_ASSERT(expectedcount < EXPECTEDSIZE);
4313                 return;
4314             }
4315             expected[expectedcount ++] = i;
4316         }
4317
4318         testBreakBoundPreceding(this, ustr, bi, expected, expectedcount);
4319     }
4320     delete bi;
4321 #endif
4322 }
4323
4324 void RBBITest::TestMonkey(char *params) {
4325 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4326
4327     UErrorCode     status    = U_ZERO_ERROR;
4328     int32_t        loopCount = 500;
4329     int32_t        seed      = 1;
4330     UnicodeString  breakType = "all";
4331     Locale         locale("en");
4332     UBool          useUText  = FALSE;
4333
4334     if (quick == FALSE) {
4335         loopCount = 10000;
4336     }
4337
4338     if (params) {
4339         UnicodeString p(params);
4340         loopCount = getIntParam("loop", p, loopCount);
4341         seed      = getIntParam("seed", p, seed);
4342
4343         RegexMatcher m(" *type *= *(char|word|line|sent|title) *", p, 0, status);
4344         if (m.find()) {
4345             breakType = m.group(1, status);
4346             m.reset();
4347             p = m.replaceFirst("", status);
4348         }
4349
4350         RegexMatcher u(" *utext", p, 0, status);
4351         if (u.find()) {
4352             useUText = TRUE;
4353             u.reset();
4354             p = u.replaceFirst("", status);
4355         }
4356
4357
4358         // m.reset(p);
4359         if (RegexMatcher(UNICODE_STRING_SIMPLE("\\S"), p, 0, status).find()) {
4360             // Each option is stripped out of the option string as it is processed.
4361             // All options have been checked.  The option string should have been completely emptied..
4362             char buf[100];
4363             p.extract(buf, sizeof(buf), NULL, status);
4364             buf[sizeof(buf)-1] = 0;
4365             errln("Unrecognized or extra parameter:  %s\n", buf);
4366             return;
4367         }
4368
4369     }
4370
4371     if (breakType == "char" || breakType == "all") {
4372         RBBICharMonkey  m;
4373         BreakIterator  *bi = BreakIterator::createCharacterInstance(locale, status);
4374         if (U_SUCCESS(status)) {
4375             RunMonkey(bi, m, "char", seed, loopCount, useUText);
4376             if (breakType == "all" && useUText==FALSE) {
4377                 // Also run a quick test with UText when "all" is specified
4378                 RunMonkey(bi, m, "char", seed, loopCount, TRUE);
4379             }
4380         }
4381         else {
4382             errcheckln(status, "Creation of character break iterator failed %s", u_errorName(status));
4383         }
4384         delete bi;
4385     }
4386
4387     if (breakType == "word" || breakType == "all") {
4388         logln("Word Break Monkey Test");
4389         RBBIWordMonkey  m;
4390         BreakIterator  *bi = BreakIterator::createWordInstance(locale, status);
4391         if (U_SUCCESS(status)) {
4392             RunMonkey(bi, m, "word", seed, loopCount, useUText);
4393         }
4394         else {
4395             errcheckln(status, "Creation of word break iterator failed %s", u_errorName(status));
4396         }
4397         delete bi;
4398     }
4399
4400     if (breakType == "line" || breakType == "all") {
4401         logln("Line Break Monkey Test");
4402         RBBILineMonkey  m;
4403         BreakIterator  *bi = BreakIterator::createLineInstance(locale, status);
4404         if (loopCount >= 10) {
4405             loopCount = loopCount / 5;   // Line break runs slower than the others.
4406         }
4407         if (U_SUCCESS(status)) {
4408             RunMonkey(bi, m, "line", seed, loopCount, useUText);
4409         }
4410         else {
4411             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4412         }
4413         delete bi;
4414     }
4415
4416     if (breakType == "sent" || breakType == "all"  ) {
4417         logln("Sentence Break Monkey Test");
4418         RBBISentMonkey  m;
4419         BreakIterator  *bi = BreakIterator::createSentenceInstance(locale, status);
4420         if (loopCount >= 10) {
4421             loopCount = loopCount / 10;   // Sentence runs slower than the other break types
4422         }
4423         if (U_SUCCESS(status)) {
4424             RunMonkey(bi, m, "sentence", seed, loopCount, useUText);
4425         }
4426         else {
4427             errcheckln(status, "Creation of line break iterator failed %s", u_errorName(status));
4428         }
4429         delete bi;
4430     }
4431
4432 #endif
4433 }
4434
4435 //
4436 //  Run a RBBI monkey test.  Common routine, for all break iterator types.
4437 //    Parameters:
4438 //       bi      - the break iterator to use
4439 //       mk      - MonkeyKind, abstraction for obtaining expected results
4440 //       name    - Name of test (char, word, etc.) for use in error messages
4441 //       seed    - Seed for starting random number generator (parameter from user)
4442 //       numIterations
4443 //
4444 void RBBITest::RunMonkey(BreakIterator *bi, RBBIMonkeyKind &mk, const char *name, uint32_t  seed,
4445                          int32_t numIterations, UBool useUText) {
4446
4447 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
4448
4449     const int32_t    TESTSTRINGLEN = 500;
4450     UnicodeString    testText;
4451     int32_t          numCharClasses;
4452     UVector          *chClasses;
4453     int              expected[TESTSTRINGLEN*2 + 1];
4454     int              expectedCount = 0;
4455     char             expectedBreaks[TESTSTRINGLEN*2 + 1];
4456     char             forwardBreaks[TESTSTRINGLEN*2 + 1];
4457     char             reverseBreaks[TESTSTRINGLEN*2+1];
4458     char             isBoundaryBreaks[TESTSTRINGLEN*2+1];
4459     char             followingBreaks[TESTSTRINGLEN*2+1];
4460     char             precedingBreaks[TESTSTRINGLEN*2+1];
4461     int              i;
4462     int              loopCount = 0;
4463
4464     m_seed = seed;
4465
4466     numCharClasses = mk.charClasses()->size();
4467     chClasses      = mk.charClasses();
4468
4469     // Check for errors that occured during the construction of the MonkeyKind object.
4470     //  Can't report them where they occured because errln() is a method coming from intlTest,
4471     //  and is not visible outside of RBBITest :-(
4472     if (U_FAILURE(mk.deferredStatus)) {
4473         errln("status of \"%s\" in creation of RBBIMonkeyKind.", u_errorName(mk.deferredStatus));
4474         return;
4475     }
4476
4477     // Verify that the character classes all have at least one member.
4478     for (i=0; i<numCharClasses; i++) {
4479         UnicodeSet *s = (UnicodeSet *)chClasses->elementAt(i);
4480         if (s == NULL || s->size() == 0) {
4481             errln("Character Class #%d is null or of zero size.", i);
4482             return;
4483         }
4484     }
4485
4486     while (loopCount < numIterations || numIterations == -1) {
4487         if (numIterations == -1 && loopCount % 10 == 0) {
4488             // If test is running in an infinite loop, display a periodic tic so
4489             //   we can tell that it is making progress.
4490             fprintf(stderr, ".");
4491         }
4492         // Save current random number seed, so that we can recreate the random numbers
4493         //   for this loop iteration in event of an error.
4494         seed = m_seed;
4495
4496         // Populate a test string with data.
4497         testText.truncate(0);
4498         for (i=0; i<TESTSTRINGLEN; i++) {
4499             int32_t  aClassNum = m_rand() % numCharClasses;
4500             UnicodeSet *classSet = (UnicodeSet *)chClasses->elementAt(aClassNum);
4501             int32_t   charIdx = m_rand() % classSet->size();
4502             UChar32   c = classSet->charAt(charIdx);
4503             if (c < 0) {   // TODO:  deal with sets containing strings.
4504                 errln("c < 0");
4505                 break;
4506             }
4507             testText.append(c);
4508         }
4509
4510         // Calculate the expected results for this test string.
4511         mk.setText(testText);
4512         memset(expectedBreaks, 0, sizeof(expectedBreaks));
4513         expectedBreaks[0] = 1;
4514         int32_t breakPos = 0;
4515         expectedCount = 0;
4516         for (;;) {
4517             breakPos = mk.next(breakPos);
4518             if (breakPos == -1) {
4519                 break;
4520             }
4521             if (breakPos > testText.length()) {
4522                 errln("breakPos > testText.length()");
4523             }
4524             expectedBreaks[breakPos] = 1;
4525             U_ASSERT(expectedCount<testText.length());
4526             expected[expectedCount ++] = breakPos;
4527         }
4528
4529         // Find the break positions using forward iteration
4530         memset(forwardBreaks, 0, sizeof(forwardBreaks));
4531         if (useUText) {
4532             UErrorCode status = U_ZERO_ERROR;
4533             UText *testUText = utext_openReplaceable(NULL, &testText, &status);
4534             // testUText = utext_openUnicodeString(testUText, &testText, &status);
4535             bi->setText(testUText, status);
4536             TEST_ASSERT_SUCCESS(status);
4537             utext_close(testUText);   // The break iterator does a shallow clone of the UText
4538                                       //  This UText can be closed immediately, so long as the
4539                                       //  testText string continues to exist.
4540         } else {
4541             bi->setText(testText);
4542         }
4543
4544         for (i=bi->first(); i != BreakIterator::DONE; i=bi->next()) {
4545             if (i < 0 || i > testText.length()) {
4546                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4547                 break;
4548             }
4549             forwardBreaks[i] = 1;
4550         }
4551
4552         // Find the break positions using reverse iteration
4553         memset(reverseBreaks, 0, sizeof(reverseBreaks));
4554         for (i=bi->last(); i != BreakIterator::DONE; i=bi->previous()) {
4555             if (i < 0 || i > testText.length()) {
4556                 errln("%s break monkey test: Out of range value returned by breakIterator::next()", name);
4557                 break;
4558             }
4559             reverseBreaks[i] = 1;
4560         }
4561
4562         // Find the break positions using isBoundary() tests.
4563         memset(isBoundaryBreaks, 0, sizeof(isBoundaryBreaks));
4564         U_ASSERT((int32_t)sizeof(isBoundaryBreaks) > testText.length());
4565         for (i=0; i<=testText.length(); i++) {
4566             isBoundaryBreaks[i] = bi->isBoundary(i);
4567         }
4568
4569
4570         // Find the break positions using the following() function.
4571         // printf(".");
4572         memset(followingBreaks, 0, sizeof(followingBreaks));
4573         int32_t   lastBreakPos = 0;
4574         followingBreaks[0] = 1;
4575         for (i=0; i<testText.length(); i++) {
4576             breakPos = bi->following(i);
4577             if (breakPos <= i ||
4578                 breakPos < lastBreakPos ||
4579                 breakPos > testText.length() ||
4580                 (breakPos > lastBreakPos && lastBreakPos > i)) {
4581                 errln("%s break monkey test: "
4582                     "Out of range value returned by BreakIterator::following().\n"
4583                         "Random seed=%d  index=%d; following returned %d;  lastbreak=%d",
4584                          name, seed, i, breakPos, lastBreakPos);
4585                 break;
4586             }
4587             followingBreaks[breakPos] = 1;
4588             lastBreakPos = breakPos;
4589         }
4590
4591         // Find the break positions using the preceding() function.
4592         memset(precedingBreaks, 0, sizeof(precedingBreaks));
4593         lastBreakPos = testText.length();
4594         precedingBreaks[testText.length()] = 1;
4595         for (i=testText.length(); i>0; i--) {
4596             breakPos = bi->preceding(i);
4597             if (breakPos >= i ||
4598                 breakPos > lastBreakPos ||
4599                 (breakPos < 0 && testText.getChar32Start(i)>0) ||
4600                 (breakPos < lastBreakPos && lastBreakPos < testText.getChar32Start(i)) ) {
4601                 errln("%s break monkey test: "
4602                     "Out of range value returned by BreakIterator::preceding().\n"
4603                     "index=%d;  prev returned %d; lastBreak=%d" ,
4604                     name,  i, breakPos, lastBreakPos);
4605                 if (breakPos >= 0 && breakPos < (int32_t)sizeof(precedingBreaks)) {
4606                     precedingBreaks[i] = 2;   // Forces an error.
4607                 }
4608             } else {
4609                 if (breakPos >= 0) {
4610                     precedingBreaks[breakPos] = 1;
4611                 }
4612                 lastBreakPos = breakPos;
4613             }
4614         }
4615
4616         // Compare the expected and actual results.
4617         for (i=0; i<=testText.length(); i++) {
4618             const char *errorType = NULL;
4619             if  (forwardBreaks[i] != expectedBreaks[i]) {
4620                 errorType = "next()";
4621             } else if (reverseBreaks[i] != forwardBreaks[i]) {
4622                 errorType = "previous()";
4623             } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
4624                 errorType = "isBoundary()";
4625             } else if (followingBreaks[i] != expectedBreaks[i]) {
4626                 errorType = "following()";
4627             } else if (precedingBreaks[i] != expectedBreaks[i]) {
4628                 errorType = "preceding()";
4629             }
4630
4631
4632             if (errorType != NULL) {
4633                 // Format a range of the test text that includes the failure as
4634                 //  a data item that can be included in the rbbi test data file.
4635
4636                 // Start of the range is the last point where expected and actual results
4637                 //   both agreed that there was a break position.
4638                 int startContext = i;
4639                 int32_t count = 0;
4640                 for (;;) {
4641                     if (startContext==0) { break; }
4642                     startContext --;
4643                     if (expectedBreaks[startContext] != 0) {
4644                         if (count == 2) break;
4645                         count ++;
4646                     }
4647                 }
4648
4649                 // End of range is two expected breaks past the start position.
4650                 int endContext = i + 1;
4651                 int ci;
4652                 for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
4653                     for (;;) {
4654                         if (endContext >= testText.length()) {break;}
4655                         if (expectedBreaks[endContext-1] != 0) {
4656                             if (count == 0) break;
4657                             count --;
4658                         }
4659                         endContext ++;
4660                     }
4661                 }
4662
4663                 // Format looks like   "<data>\\\uabcd\uabcd\\\U0001abcd...</data>"
4664                 UnicodeString errorText = "<data>";
4665                 /***if (strcmp(errorType, "next()") == 0) {
4666                     startContext = 0;
4667                     endContext = testText.length();
4668
4669                     printStringBreaks(testText, expected, expectedCount);
4670                 }***/
4671
4672                 for (ci=startContext; ci<endContext;) {
4673                     UnicodeString hexChars("0123456789abcdef");
4674                     UChar32  c;
4675                     int      bn;
4676                     c = testText.char32At(ci);
4677                     if (ci == i) {
4678                         // This is the location of the error.
4679                         errorText.append("<?>");
4680                     } else if (expectedBreaks[ci] != 0) {
4681                         // This a non-error expected break position.
4682                         errorText.append("\\");
4683                     }
4684                     if (c < 0x10000) {
4685                         errorText.append("\\u");
4686                         for (bn=12; bn>=0; bn-=4) {
4687                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4688                         }
4689                     } else {
4690                         errorText.append("\\U");
4691                         for (bn=28; bn>=0; bn-=4) {
4692                             errorText.append(hexChars.charAt((c>>bn)&0xf));
4693                         }
4694                     }
4695                     ci = testText.moveIndex32(ci, 1);
4696                 }
4697                 errorText.append("\\");
4698                 errorText.append("</data>\n");
4699
4700                 // Output the error
4701                 char  charErrorTxt[500];
4702                 UErrorCode status = U_ZERO_ERROR;
4703                 errorText.extract(charErrorTxt, sizeof(charErrorTxt), NULL, status);
4704                 charErrorTxt[sizeof(charErrorTxt)-1] = 0;
4705                 errln("%s break monkey test error.  %s. Operation = %s; Random seed = %d;  buf Idx = %d\n%s",
4706                     name, (expectedBreaks[i]? "break expected but not found" : "break found but not expected"),
4707                     errorType, seed, i, charErrorTxt);
4708                 break;
4709             }
4710         }
4711
4712         loopCount++;
4713     }
4714 #endif
4715 }
4716
4717
4718 //  Bug 5532.  UTF-8 based UText fails in dictionary code.
4719 //             This test checks only the initial patch for that bug,
4720 //             which is to just keep the break iterator from crashing or hanging.
4721 //             Correct word boundaries are not verified here; they
4722 //             await a proper fix to the dictionary code.
4723 void RBBITest::TestBug5532(void)  {
4724    // UTF-8 encoded text: a mixture of Thai and Latin, ending with a 0x00 terminator byte.
4725    const unsigned char utf8Data[] = {
4726            0xE0u, 0xB8u, 0x82u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0xA2u, 0xE0u,
4727            0xB9u, 0x80u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u, 0xA3u, 0xE0u, 0xB8u,
4728            0xB7u, 0xE0u, 0xB9u, 0x88u, 0xE0u, 0xB8u, 0xADu, 0xE0u, 0xB8u, 0x87u,
4729            0xE0u, 0xB9u, 0x80u, 0xE0u, 0xB8u, 0xA5u, 0xE0u, 0xB9u, 0x88u, 0xE0u,
4730            0xB8u, 0x99u, 0xE0u, 0xB8u, 0x8Bu, 0xE0u, 0xB8u, 0xB5u, 0xE0u, 0xB8u,
4731            0x94u, 0xE0u, 0xB8u, 0xB5u, 0x20u, 0x73u, 0x69u, 0x6Du, 0x20u, 0x61u,
4732            0x75u, 0x64u, 0x69u, 0x6Fu, 0x2Fu, 0x20u, 0x4Du, 0x4Fu, 0x4Fu, 0x4Eu,
4733            0x20u, 0x65u, 0x63u, 0x6Cu, 0x69u, 0x70u, 0x73u, 0x65u, 0x20u, 0xE0u,
4734            0xB8u, 0xA3u, 0xE0u, 0xB8u, 0xB2u, 0xE0u, 0xB8u, 0x84u, 0xE0u, 0xB8u,
4735            0xB2u, 0x20u, 0x34u, 0x37u, 0x30u, 0x30u, 0x20u, 0xE0u, 0xB8u, 0xA2u,
4736            0xE0u, 0xB8u, 0xB9u, 0xE0u, 0xB9u, 0x82u, 0xE0u, 0xB8u, 0xA3u, 0x00};
4737
4738     UErrorCode status = U_ZERO_ERROR;   // one status variable shared by every ICU call below
4739     UText utext=UTEXT_INITIALIZER;   // stack-allocated UText, filled in by utext_openUTF8()
4740     utext_openUTF8(&utext, (const char *)utf8Data, -1, &status);   // length -1: data is treated as NUL terminated (see the utext_openUTF8 docs)
4741     TEST_ASSERT_SUCCESS(status);   // reports a failure but does not return or reset status
4742
4743     BreakIterator *bi = BreakIterator::createWordInstance(Locale("th"), status);   // word breaks for the Thai locale
4744     TEST_ASSERT_SUCCESS(status);
4745     if (U_SUCCESS(status)) {   // status is shared, so a failed utext_openUTF8() skips the iteration as well
4746         bi->setText(&utext, status);
4747         TEST_ASSERT_SUCCESS(status);
4748
4749         int32_t breakCount = 0;   // number of boundaries returned by next()
4750         int32_t previousBreak = -1;   // -1 is smaller than any non-negative boundary index
4751         for (bi->first(); bi->next() != BreakIterator::DONE; breakCount++) {   // walk forward from the start until next() reports DONE
4752             // For now, just make sure that the break iterator doesn't hang.
4753             TEST_ASSERT(previousBreak < bi->current());   // each boundary must lie strictly after the previous one
4754             previousBreak = bi->current();
4755         }
4756         TEST_ASSERT(breakCount > 0);   // at least one boundary must have been produced
4757     }
4758     delete bi;   // reached on both the success and failure paths; deleting a null pointer is harmless
4759     utext_close(&utext);   // closed only after the break iterator that was reading from it is gone
4760 }
4761
4762
4763 //
4764 //  TestDebug    -  A place-holder test for debugging purposes.
4765 //                  For putting in fragments of other tests that can be invoked
4766 //                  for tracing  without a lot of unwanted extra stuff happening.
4767 //                  The whole body is currently compiled out with #if 0, so this test does nothing.
4768 void RBBITest::TestDebug(void) {
4769 #if 0   // everything down to the matching #endif is disabled scratch code
4770     UErrorCode   status = U_ZERO_ERROR;
4771     int pos = 0;
4772     int ruleStatus = 0;
4773
4774     RuleBasedBreakIterator* bi =   // exactly one of the factory calls below is left uncommented
4775        // (RuleBasedBreakIterator *)BreakIterator::createLineInstance(Locale::getDefault(), status);
4776        // (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::Locale("th"), status);
4777        (RuleBasedBreakIterator *)BreakIterator::createSentenceInstance(Locale::getDefault(), status);
4778     UnicodeString s("\\u2008\\u002e\\udc6a\\u37cd\\u71d0\\u2048\\U000e006a\\u002e\\u0046\\ufd3f\\u000a\\u002e");   // escaped text, expanded by unescape() below
4779     // UnicodeString s("Aaa.  Bcd");
4780     s = s.unescape();
4781     bi->setText(s);
4782     UBool r = bi->isBoundary(8);   // probe one offset and print the answer
4783     printf("%s", r?"true":"false");
4784     return;   // early exit: the reverse-iteration dump below is unreachable even when the block is enabled
4785     pos = bi->last();
4786     do {
4787         // ruleStatus = bi->getRuleStatus();   (while this stays commented out, ruleStatus prints as 0)
4788         printf("%d\t%d\n", pos, ruleStatus);
4789         pos = bi->previous();
4790     } while (pos != BreakIterator::DONE);
4791 #endif   // NOTE(review): the fragment never deletes bi and never checks status; harmless while compiled out
4792 }
4793
4794 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */